diff --git a/Manifest.toml b/Manifest.toml index 4308840..02641db 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -38,9 +38,9 @@ version = "5.0.7" [[deps.ArrayLayouts]] deps = ["FillArrays", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "8b921542ad44cba67f1487e2226446597e0a90af" +git-tree-sha1 = "c23473c60476e62579c077534b9643ec400f792b" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" -version = "0.8.5" +version = "0.8.6" [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -176,9 +176,9 @@ version = "1.0.3" [[deps.DiffRules]] deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "dd933c4ef7b4c270aacd4eb88fa64c147492acf0" +git-tree-sha1 = "28d605d9a0ac17118fe2c5e9ce0fbb76c3ceb120" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.10.0" +version = "1.11.0" [[deps.Distances]] deps = ["LinearAlgebra", "SparseArrays", "Statistics", "StatsAPI"] @@ -237,9 +237,9 @@ version = "0.1.1" [[deps.ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "LogExpFunctions", "NaNMath", "Preferences", "Printf", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "40d1546a45abd63490569695a86a2d93c2021e54" +git-tree-sha1 = "34e6147e7686a101c245f12dba43b743c7afda96" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.26" +version = "0.10.27" [[deps.FunctionWrappers]] git-tree-sha1 = "241552bc2209f0fa068b6415b1942cc0aa486bcc" @@ -396,9 +396,9 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[deps.LogExpFunctions]] deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "a970d55c2ad8084ca317a4658ba6ce99b7523571" +git-tree-sha1 = "44a7b7bb7dd1afe12bac119df6a7e540fa2c96bc" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.12" +version = "0.3.13" [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -410,9 +410,9 @@ version = "0.4.10" [[deps.MLUtils]] deps = ["ChainRulesCore", "DelimitedFiles", "FLoops", "FoldsThreads", "Random", "ShowCases", "Statistics", "StatsBase"] -git-tree-sha1 = "32eeb46fa393ae36a4127c9442ade478c8d01117" +git-tree-sha1 = "202617a5a49a8b5f3b4abf96621f2519b1592c74" uuid = "f1d291b0-491e-4a28-83b9-f70985020b54" -version = "0.2.3" +version = "0.2.4" [[deps.MPC_jll]] deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "MPFR_jll", "Pkg"] @@ -533,9 +533,9 @@ version = "1.4.1" [[deps.Parsers]] deps = ["Dates"] -git-tree-sha1 = "3b429f37de37f1fc603cc1de4a799dc7fbe4c0b6" +git-tree-sha1 = "1285416549ccfcdf0c50d4997a94331e88d68413" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.3.0" +version = "2.3.1" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] @@ -658,9 +658,9 @@ version = "0.1.14" [[deps.Static]] deps = ["IfElse"] -git-tree-sha1 = "2114b1d8517764a8c4625a2e97f40640c7a301a7" +git-tree-sha1 = "b1f1f60bf4f25d8b374480fb78c7b9785edf95fd" uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" -version = "0.6.1" +version = "0.6.2" [[deps.StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] diff --git a/Project.toml b/Project.toml index f4b21a1..7565bfc 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,9 @@ BesselK = "0.3" BlockFactorizations = "1.2.1" DiffResults = "1.0" FillArrays = "0.12, 0.13" +Flux = "0.13" ForwardDiff = "0.10" +Functors = "0.2" IterativeSolvers = "0.9" 
KroneckerProducts = "1.0" LazyArrays = "0.22" diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/src/.DS_Store and /dev/null differ diff --git a/src/barneshut.jl b/src/barneshut.jl index 0e66749..e46e0d3 100644 --- a/src/barneshut.jl +++ b/src/barneshut.jl @@ -30,7 +30,7 @@ function BarnesHutFactorization(k, x, y = x, D = nothing; θ::Real = 1/4, leafsi # w = zeros(length(m)) # i = zeros(Bool, m) # WT, BT = typeof(w), typeof(i) - T = gramian_eltype(k, xs, ys) + T = gramian_eltype(k, xs[1], ys[1]) BarnesHutFactorization{T, KT, XT, YT, TT, DT, RT}(k, xs, ys, Tree, D, θ) #, w, i) end function BarnesHutFactorization(G::Gramian, θ::Real = 1/2; leafsize::Int = BARNES_HUT_DEFAULT_LEAFSIZE) @@ -49,7 +49,7 @@ function LinearAlgebra.mul!(y::AbstractVector, F::BarnesHutFactorization, x::Abs taylor!(y, F, x, α, β) end end -function Base.:*(F::BarnesHutFactorization, x::AbstractVector) +function Base.:*(F::BarnesHutFactorization{<:Number}, x::AbstractVector{<:Number}) T = promote_type(eltype(F), eltype(x)) y = zeros(T, size(F, 1)) mul!(y, F, x) @@ -148,45 +148,45 @@ end ############################# centers of mass ################################## # this is a weighted sum, could be generalized to incorporate node_sums -function compute_centers_of_mass(x::AbstractVector, w::AbstractVector, T::BallTree) +function compute_centers_of_mass(w::AbstractVector, x::AbstractVector, T::BallTree) D = eltype(x) <: StaticVector ? length(eltype(x)) : length(x[1]) # if x is static vector com = [zero(MVector{D, Float64}) for _ in 1:length(T.hyper_spheres)] - compute_centers_of_mass!(com, x, w, T) + compute_centers_of_mass!(com, w, x, T) end function compute_centers_of_mass(F::BarnesHutFactorization, w::AbstractVector) - compute_centers_of_mass(F.y, w, F.Tree) + compute_centers_of_mass(w, F.y, F.Tree) end -function compute_centers_of_mass!(com::AbstractVector, x::AbstractVector, w::AbstractVector, T::BallTree) +function compute_centers_of_mass!(com::AbstractVector, w::AbstractVector, x::AbstractVector, T::BallTree) abs_w = abs.(w) - weighted_node_sums!(com, x, abs_w, T) + weighted_node_sums!(com, abs_w, x, T) sum_w = node_sums(abs_w, T) ε = eps(eltype(w)) # ensuring division by zero it not a problem @. 
com ./= sum_w + ε end -node_sums(x::AbstractVector, T::BallTree) = weighted_node_sums(x, Ones(length(x)), T) +node_sums(x::AbstractVector, T::BallTree) = weighted_node_sums(Ones(length(x)), x, T) function node_sums!(sums, x::AbstractVector, T::BallTree) - weighted_node_sums!(sums, x, Ones(length(x)), T) + weighted_node_sums!(sums, Ones(length(x)), x, T) end -function weighted_node_sums(x::AbstractVector, w::AbstractVector, T::BallTree, index::Int = 1) +function weighted_node_sums(w::AbstractVector, x::AbstractVector, T::BallTree, index::Int = 1) length(x) == 0 && return zero(eltype(x)) - sums = zeros(typeof(w[1]'x[1]), length(T.hyper_spheres)) - weighted_node_sums!(sums, x, w, T) + sums = fill(zero(w[1]'x[1]), length(T.hyper_spheres)) + weighted_node_sums!(sums, w, x, T) end # NOTE: x should either be vector of numbers or vector of static arrays -function weighted_node_sums!(sums::AbstractVector, x::AbstractVector, - w::AbstractVector{<:Number}, T::BallTree, index::Int = 1) +function weighted_node_sums!(sums::AbstractVector, w::AbstractVector, + x::AbstractVector, T::BallTree, index::Int = 1) if isleaf(T.tree_data.n_internal_nodes, index) i = get_leaf_range(T.tree_data, index) wi, xi = @views w[T.indices[i]], x[T.indices[i]] sums[index] = wi'xi else - task = @spawn weighted_node_sums!(sums, x, w, T, getleft(index)) - weighted_node_sums!(sums, x, w, T, getright(index)) + task = @spawn weighted_node_sums!(sums, w, x, T, getleft(index)) + weighted_node_sums!(sums, w, x, T, getright(index)) wait(task) sums[index] = sums[getleft(index)] + sums[getright(index)] end diff --git a/src/gradient.jl b/src/gradient.jl index ad88caf..fe83333 100644 --- a/src/gradient.jl +++ b/src/gradient.jl @@ -11,7 +11,7 @@ struct GradientKernel{T, K, IT<:InputTrait} <: MultiKernel{T} end GradientKernel(k::AbstractKernel{T}) where T = GradientKernel{T, typeof(k)}(k) GradientKernel(k) = GradientKernel{Float64, typeof(k)}(k) -input_trait(G::GradientKernel) = G.input_trait +@inline input_trait(G::GradientKernel) = G.input_trait # gramian_eltype(G::GradientKernel) = eltype(G.k) Base.eltype(G::GradientKernel) = eltype(G.k) @@ -20,15 +20,15 @@ function elsize(G::Gramian{<:AbstractMatrix, <:GradientKernel}, i::Int) end function (G::GradientKernel)(x::AbstractVector, y::AbstractVector) - gradient_kernel(G.k, x, y, input_trait(G.k)) + gradient_kernel(G.k, x, y, input_trait(G)) end # need this to work with BlockFactorizations, see blockmul! in gramian -function evaluate_block!(K::AbstractMatOrFac, G::GradientKernel, - x::AbstractVector, y::AbstractVector, T::InputTrait = input_trait(G.k)) +function evaluate_block!(K, G::GradientKernel, x::AbstractVector, y::AbstractVector, T::InputTrait = input_trait(G.k)) gradient_kernel!(K, G.k, x, y, T) end +# necessary? function gradient_kernel(k, x::AbstractVector, y::AbstractVector, T::InputTrait) #::GenericInput = GenericInput()) K = allocate_gradient_kernel(k, x, y, T) gradient_kernel!(K, k, x, y, T) @@ -42,7 +42,7 @@ function allocate_gradient_kernel(k, x, y, ::GenericInput) end function gradient_kernel!(K::AbstractMatrix, k, x::AbstractVector, y::AbstractVector, ::GenericInput) - value = similar(x) + value = zeros(eltype(K), length(x)) # necessary? derivs = K # jacobian (higher order terms) result = DiffResults.DiffResult(value, K) g(y) = ForwardDiff.gradient(z->k(z, y), x) @@ -50,51 +50,144 @@ function gradient_kernel!(K::AbstractMatrix, k, x::AbstractVector, y::AbstractVe return K end -function allocate_gradient_kernel(k, x, y, T::DotProductInput) - u = y isa SVector ? 
MVector(y) : zero(y) - U = reshape(u, :, 1) - v = x isa SVector ? MVector(x) : zero(x) - V = reshape(v, :, 1)' - a = x isa SVector ? MVector(x) : zero(x) - A = Diagonal(a) - C = MMatrix{1, 1}(k(x, y)) - W = Woodbury(A, U, C, V) # needs to be separate from IsotropicInput, +# specialized lazy representation of element of gradient kernel gramian +# increases performance dramatically, compared to generic Woodbury implementation +struct GradientKernelElement{T, K, X, Y, IT} <: Factorization{T} + k::K + x::X + y::Y + input_trait::IT end -# NOTE: assumes K is allocated with allocate_gradient_kernel -function gradient_kernel!(K::Woodbury, k, x::AbstractVector, y::AbstractVector, ::DotProductInput) - d² = dot(x, y) - k1, k2 = derivative_laplacian(k, d²) - @. K.A.diag = k1 - @. K.U = y - @. K.C = k2 - @. K.V = x' - return K +Base.size(K::GradientKernelElement) = (length(K.x), length(K.y)) +Base.size(K::GradientKernelElement, i::Int) = 1 ≤ i ≤ 2 ? size(K)[i] : 1 +Base.eltype(K::GradientKernelElement{T}) where T = T +# gradient kernel element only used for sparsely representable elements +# Base.Matrix(K::GradientKernelElement) = Matrix(Woodbury(K)) +Base.Matrix(K::GradientKernelElement) = K * I(size(K, 1)) + +function Base.:*(G::GradientKernelElement, a) + T = promote_type(eltype(G), eltype(a)) + b = zeros(T, size(a)) + mul!(b, G, a) end -function allocate_gradient_kernel(k, x, y, T::IsotropicInput) - r = reshape(x - y, :, 1) # do these work without reshape? - if r isa SVector - r = MVector(r) - end - kxy = k(x, y) +const GenericGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, <:GenericInput} + +const IsotropicGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, IsotropicInput} +function IsotropicGradientKernelElement{T}(k, x, y) where T + IsotropicGradientKernelElement{T, typeof(k), typeof(x), typeof(y)}(k, x, y, IsotropicInput()) +end + +# isotropic kernel +function LinearAlgebra.mul!(b, G::IsotropicGradientKernelElement, a, α::Number = 1, β::Number = 0) #, ::IsotropicInput = G.input_trait) + r = difference(G.x, G.y) # other implementation appears to be faster for d = 128 since r does not have to be computed twice? + r² = sum(abs2, r) + k1, k2 = derivative_laplacian(G.k, r²) # IDEA: these could be computed once and stored in G + dot_r_a = r'a # dot(r, a) + @. b = α * -2(k1 * a + 2*k2 * r * dot_r_a) + β * b +end + +# sparse but not completely lazy representation +function WoodburyFactorizations.Woodbury(K::IsotropicGradientKernelElement) + k, x, y = K.k, K.x, K.y + r = x - y + r = reshape(r, :, 1) # do these work without reshape? + r² = sum(abs2, r) d = length(x) - # IDEA: have mutable Fill, since this is all we need, will save further allocations, check mutable_fill.jl in doodles - # D = Diagonal(MVector{d, typeof(kxy)}(undef)) # not using Fill here, because value can't be changed - D = Diagonal(zeros(typeof(kxy), d)) # not using Fill here, because value can't be changed - C = MMatrix{1, 1}(kxy) - K = Woodbury(D, r, C, r') + k1, k2 = derivative_laplacian(k, r²) + D = (-2k1*I)(d) + C = MMatrix{1, 1}(-4k2) + return K = Woodbury(D, r, C, r') end -# NOTE: assumes K is allocated with allocate_gradient_kernel -function gradient_kernel!(K::Woodbury, k, x::AbstractVector, y::AbstractVector, ::IsotropicInput) - r = K.U - @. 
r = x - y - d² = sum(abs2, r) +function allocate_gradient_kernel(k, x, y, ::IsotropicInput) + T = gramian_eltype(k, x, y) + IsotropicGradientKernelElement{T}(k, x, y) +end + +function gradient_kernel!(K::IsotropicGradientKernelElement, k, x::AbstractVector, y::AbstractVector, ::IsotropicInput) + typeof(K)(k, x, y, IsotropicInput()) +end + +function gradient_kernel(k, x::AbstractVector, y::AbstractVector, ::IsotropicInput) + allocate_gradient_kernel(k, x, y, IsotropicInput()) +end + +const DotProductGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, DotProductInput} +function DotProductGradientKernelElement{T}(k, x, y) where T + DotProductGradientKernelElement{T, typeof(k), typeof(x), typeof(y)}(k, x, y, DotProductInput()) +end +function LinearAlgebra.mul!(b, K::DotProductGradientKernelElement, a, α::Number = 1, β::Number = 0) + k, x, y = K.k, K.x, K.y + d² = dot(x, y) k1, k2 = derivative_laplacian(k, d²) - @. K.A.diag = -2k1 - K.C[1, 1] = -4k2 - return K + dot_x_a = x'a # dot(x, a) + @. b = α * (k1 * a + k2 * y * dot_x_a) + β * b +end + +# sparse but not completely lazy representation +function WoodburyFactorizations.Woodbury(K::DotProductGradientKernelElement) + k, x, y = K.k, K.x, K.y + d² = dot(x, y) + k1, k2 = derivative_laplacian(k, d²) + D = (k1*I)(length(x)) + C = MMatrix{1, 1}(k2) + return K = Woodbury(D, copy(y), C, copy(x)') +end + +function allocate_gradient_kernel(k, x, y, T::DotProductInput) + T = gramian_eltype(k, x, y) + DotProductGradientKernelElement{T}(k, x, y) +end + +function gradient_kernel!(K::DotProductGradientKernelElement, k, x::AbstractVector, y::AbstractVector, ::DotProductInput) + typeof(K)(k, x, y, DotProductInput()) +end + +function gradient_kernel(k, x::AbstractVector, y::AbstractVector, ::DotProductInput) + allocate_gradient_kernel(k, x, y, DotProductInput()) +end + + +const LinearFunctionalGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, StationaryLinearFunctionalInput} +function LinearFunctionalGradientKernelElement{T}(k, x, y) where T + LinearFunctionalGradientKernelElement{T, typeof(k), typeof(x), typeof(y)}(k, x, y, StationaryLinearFunctionalInput()) +end + +function LinearAlgebra.mul!(b, K::LinearFunctionalGradientKernelElement, a, α::Number = 1, β::Number = 0) + k, x, y = K.k, K.x, K.y + r = difference(x, y) + cr = dot(k.c, r) + k1, k2 = derivative_laplacian(k, cr) + dot_c_a = k.c'a # dot(c, a) + @. b = α * -k2 * k.c * dot_c_a + β * b +end + +# Base.Matrix(K::LinearFunctionalGradientKernelElement) = Matrix(LazyMatrixProduct(K)) # or more generally: K * I(size(K, 1)) +# sparse but not completely lazy representation +function LazyMatrixProduct(K::LinearFunctionalGradientKernelElement) + k, x, y = K.k, K.x, K.y + r = difference(x, y) + cr = dot(k.c, r) + k1, k2 = derivative_laplacian(k, cr) + c = zeros(eltype(k.c), length(x)) + @. 
c = k.c + c2 = -k2*c + return LazyMatrixProduct(c, c2') +end + +function allocate_gradient_kernel(k, x, y, T::StationaryLinearFunctionalInput) + T = gramian_eltype(k, x, y) + LinearFunctionalGradientKernelElement{T}(k, x, y) +end + +function gradient_kernel!(K::LinearFunctionalGradientKernelElement, k, x::AbstractVector, y::AbstractVector, ::StationaryLinearFunctionalInput) + typeof(K)(k, x, y, StationaryLinearFunctionalInput()) +end + +function gradient_kernel(k, x::AbstractVector, y::AbstractVector, ::StationaryLinearFunctionalInput) + allocate_gradient_kernel(k, x, y, StationaryLinearFunctionalInput()) end ############################# Constant Kernel ################################## @@ -372,7 +465,7 @@ function value_gradient_kernel!(K::DerivativeKernelElement, k, x::AbstractVector K.value_value = k(x, y) K.value_gradient .= ForwardDiff.gradient(z->k(x, z), y) # ForwardDiff.gradient!(r, z->k(z, y), x) K.gradient_value .= ForwardDiff.gradient(z->k(z, y), x) - gradient_kernel!(K.gradient_gradient, k, x, y, T) # call GradientKernel for component + K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T) return K end @@ -382,7 +475,7 @@ function value_gradient_kernel!(K::DerivativeKernelElement, k, x::AbstractVector K.value_value = k0 @. K.value_gradient = k1*x # ForwardDiff.gradient(z->k(x, z), y)' @. K.gradient_value = k1*y # ForwardDiff.gradient(z->k(z, y), x) - gradient_kernel!(K.gradient_gradient, k, x, y, T) # call GradientKernel for component + K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T) return K end @@ -395,7 +488,7 @@ function value_gradient_kernel!(K::DerivativeKernelElement, k, x::AbstractVector @. K.value_gradient = r @. K.value_gradient *= -2k1 # ForwardDiff.gradient(z->k(x, z), y)' @. K.gradient_value *= 2k1 # ForwardDiff.gradient(z->k(z, y), x) - gradient_kernel!(K.gradient_gradient, k, x, y, T) # call GradientKernel for component + K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T) return K end @@ -441,18 +534,21 @@ end ############################# Helpers ########################################## # computes value, first, and second derivatives of f at x +# this works even with nested differentiation function value_derivative_laplacian(f, x::Real) - f(x), derivative_laplacian(f, x)... + fx = f(x) + fx, derivative_laplacian(f, x, typeof(fx))... 
end # TODO: overwrite for MaternP -function derivative_laplacian(f, x::Real) +function derivative_laplacian(f, x::Real, T::DataType = typeof(x)) g(x) = ForwardDiff.derivative(f, x) - value_derivative(g, x) + value_derivative(g, x, T) end -# WARNING: do not use if nested differentation through value_derivative is happening -function value_derivative(f, x::Real) - r = DiffResults.DiffResult(zero(x), zero(x)) # this could take pre-allocated temporary storage +# NOTE: if nested differentiation through value_derivative is happening, +# the correct output type T = typeof(f(x)) needs to be passed +function value_derivative(f, x::Real, T::DataType = typeof(x)) + r = DiffResults.DiffResult(zero(T), zero(T)) # this could take pre-allocated temporary storage r = ForwardDiff.derivative!(r, f, x) r.value, r.derivs[1] end diff --git a/src/gradient_algebra.jl b/src/gradient_algebra.jl index 0bef882..5564096 100644 --- a/src/gradient_algebra.jl +++ b/src/gradient_algebra.jl @@ -3,13 +3,15 @@ # allocates space for gradient kernel evaluation but does not evaluate # the separation from evaluation is useful for ValueGradientKernel function allocate_gradient_kernel(k::Sum, x, y, ::GenericInput) - H = (allocate_gradient_kernel(h, x, y, input_trait(h)) for h in k.args) - LazyMatrixSum(H...) + H = [allocate_gradient_kernel(h, x, y, input_trait(h)) for h in k.args] + LazyMatrixSum(H) end +# NOTE: K should be allocated with allocate_gradient_kernel(k, x, y) function gradient_kernel!(K::LazyMatrixSum, k::Sum, x::AbstractVector, y::AbstractVector, ::GenericInput) - for (h, H) in zip(k.args, K.args) - gradient_kernel!(H, h, x, y, input_trait(h)) + for i in eachindex(k.args) + h = k.args[i] + K.args[i] = gradient_kernel!(K.args[i], h, x, y, input_trait(h)) end return K end @@ -40,7 +42,7 @@ function gradient_kernel!(W::Woodbury, k::Product, x::AbstractVector, y::Abstrac h, H = k.args[i], A.args[i] D = H.args[1] @. D.diag = k_j - gradient_kernel!(H.args[2], h, x, y, input_trait(h)) + H.args[2] = gradient_kernel!(H.args[2], h, x, y, input_trait(h)) end return W end @@ -136,7 +138,7 @@ function gradient_kernel!(W::Woodbury, k::VerticalRescaling, x, y, ::GenericInpu @. A.args[1].diag = fx H = A.args[2] # LazyMatrixProduct: first and third are the diagonal scaling matrices, second is the gradient_kernel_matrix of h @. A.args[3].diag = fy - gradient_kernel!(H, h, x, y, input_trait(h)) + A.args[2] = gradient_kernel!(H, h, x, y, input_trait(h)) ForwardDiff.gradient!(@view(W.U[:, 1]), f, x) ForwardDiff.gradient!(@view(W.U[:, 2]), z->h(z, y), x) ForwardDiff.gradient!(@view(W.V[1, :]), f, y) @@ -165,7 +167,7 @@ function gradient_kernel!(W::Woodbury, k::Chained, x, y, ::GenericInput) f1, f2 = derivative_laplacian(f, h(x, y)) @. A.args[1].diag = f1 H = A.args[2] # LazyMatrixProduct: first argument is diagonal scaling, second is the gradient_kernel_matrix of h - gradient_kernel!(H, h, x, y, input_trait(h)) + H.A.args[2] = gradient_kernel!(H, h, x, y, input_trait(h)) ForwardDiff.gradient!(@view(W.U[:]), z->h(z, y), x) ForwardDiff.gradient!(@view(W.V[1, :]), z->h(x, z), y) @.
W.C = f2 diff --git a/src/gramian.jl b/src/gramian.jl index 0cfa767..01c8a1c 100644 --- a/src/gramian.jl +++ b/src/gramian.jl @@ -17,7 +17,7 @@ end const BlockGramian{T, M, K, X, Y} = BlockFactorization{<:T, <:Gramian{M, K, X, Y}} function Gramian(k, x::AbstractVector, y::AbstractVector = x) - T = gramian_eltype(k, x, y) + T = gramian_eltype(k, x[1], y[1]) Gramian{T, typeof(k), typeof(x), typeof(y)}(k, x, y) end # with euclidean dot product @@ -31,8 +31,7 @@ elsize(G::Gramian{<:Number}) = () function gramian_eltype(k::MercerKernel, x, y) promote_type(eltype(k), eltype(eltype(x)), eltype(eltype(y))) end -gramian_eltype(k, x, y) = typeof(k(x[1], y[1])) # default to evaluation - +gramian_eltype(k, x, y) = typeof(k(x, y)) # default to evaluation # indexing # NOTE: @inline helps increase mvm performance by 50% @@ -226,6 +225,10 @@ function BlockFactorizations.blockmul!(y::AbstractVecOfVecOrMat, G::Gramian, x:: return y end +function LinearAlgebra.mul!(y::AbstractVecOfVecOrMat, G::Gramian, x::AbstractVecOfVecOrMat, α::Real = 1, β::Real = 0) + BlockFactorizations.blockmul!(y, G, x, α, β) +end + # IDEA: @inline? function evaluate_block!(Gij, G::Gramian, i::Int, j::Int, T = input_trait(G.k)) evaluate_block!(Gij, G.k, G.x[i], G.y[j], T) diff --git a/src/lazy_linear_algebra.jl b/src/lazy_linear_algebra.jl index b8680be..376ff5e 100644 --- a/src/lazy_linear_algebra.jl +++ b/src/lazy_linear_algebra.jl @@ -14,11 +14,12 @@ abstract type LazyFactorization{T} <: Factorization{T} end # in that it calculates intermediate results, important to make use of special structure # observation: this is remarkably similar to the KroneckerProducts implementation # allow AbstractMatOrFacOrUni? will need adjustment for size function -struct LazyMatrixProduct{T, AT<:Tuple{Vararg{AbstractMatOrFac}}, F} <: LazyFactorization{T} +struct LazyMatrixProduct{T, AT, F} <: LazyFactorization{T} args::AT tol::F # temporaries - function LazyMatrixProduct(args::Tuple; tol::Real = default_tol) + # args is vector or tuple of constituent matrices + function LazyMatrixProduct(args; tol::Real = default_tol) for i in 1:length(args)-1 size(args[i], 2) == size(args[i+1], 1) || throw(DimensionMismatch("$i")) end @@ -26,7 +27,9 @@ struct LazyMatrixProduct{T, AT<:Tuple{Vararg{AbstractMatOrFac}}, F} <: LazyFacto new{T, typeof(args), typeof(tol)}(args, tol) end end -LazyMatrixProduct(A::AbstractMatOrFac...; tol::Real = default_tol) = LazyMatrixProduct(A; tol = tol) +function LazyMatrixProduct(A::AbstractMatOrFac...; tol::Real = default_tol) + LazyMatrixProduct([A...]; tol = tol) +end function Base.size(L::LazyMatrixProduct, i::Int) if i == 1 @@ -80,16 +83,19 @@ end ################################################################################ # in contrast to the AppliedMatrix in LazyArrays, this is not completely lazy # in that it calculates intermediate results -struct LazyMatrixSum{T, AT<:Tuple{Vararg{AbstractMatOrFac}}, F} <: LazyFactorization{T} +# TODO: should we have a tol field? +struct LazyMatrixSum{T, AT, F} <: LazyFactorization{T} args::AT tol::F - function LazyMatrixSum(args::Tuple; tol::Real = default_tol) + function LazyMatrixSum(args; tol::Real = default_tol) all(==(size(args[1])), size.(args)) || throw(DimensionMismatch()) T = promote_type(eltype.(args)...) 
new{T, typeof(args), typeof(tol)}(args, tol) end end -LazyMatrixSum(A::AbstractMatOrFac...; tol::Real = default_tol) = LazyMatrixSum(A, tol = tol) +function LazyMatrixSum(A::AbstractMatOrFac...; tol::Real = default_tol) + LazyMatrixSum([A...], tol = tol) +end Base.size(L::LazyMatrixSum, i...) = size(L.args[1], i...) Base.adjoint(L::LazyMatrixSum) = LazyMatrixSum(adjoint.(L.args)) diff --git a/src/properties.jl b/src/properties.jl index 60cb918..b6c6a03 100644 --- a/src/properties.jl +++ b/src/properties.jl @@ -30,9 +30,10 @@ isdot(P::Power) = isdot(P.k) abstract type InputTrait end struct GenericInput <: InputTrait end -struct IsotropicInput <: InputTrait end -struct DotProductInput <: InputTrait end -struct StationaryInput <: InputTrait end +struct IsotropicInput <: InputTrait end # dependent on |r|² +struct DotProductInput <: InputTrait end # dependent on x ⋅ y +struct StationaryInput <: InputTrait end # dependent on r +struct StationaryLinearFunctionalInput <: InputTrait end # dependent on c ⋅ r struct PeriodicInput <: InputTrait end input_trait(::T) where T = GenericInput() diff --git a/src/stationary.jl b/src/stationary.jl index 86b724d..9dab7f7 100644 --- a/src/stationary.jl +++ b/src/stationary.jl @@ -34,7 +34,7 @@ end #################### standard exponentiated quadratic kernel ################### struct ExponentiatedQuadratic{T} <: IsotropicKernel{T} end -const EQ = ExponentiatedQuadratic +const EQ{T} = ExponentiatedQuadratic{T} @functor EQ EQ() = EQ{Union{}}() # defaults to "bottom" type since it doesn't have any parameters (k::EQ)(r²::Number) = exp(-r²/2) @@ -50,9 +50,6 @@ RQ(α::Real) = RQ{typeof(α)}(α) (k::RQ)(r²::Number) = (1 + r² / (2*k.α))^-k.α -parameters(k::RQ) = [k.α] -nparameters(::RQ) = 1 - ########################### exponential kernel ################################# struct Exponential{T} <: IsotropicKernel{T} end const Exp = Exponential @@ -133,10 +130,11 @@ MaternP(k::Matern) = MaternP(floor(Int, k.ν)) # project Matern to closest Mater (k::MaternP)(r²::Integer) = k(float(r²)) function (k::MaternP)(r²::Number) - r² < 0 && println(r²) p = k.p - taylor_bound = eps(typeof(r²))^(1/p) - if r² < taylor_bound # taylor_bound # around zero, use taylor expansion in r² to avoid singularity in derivative + r²_constant = (r² isa Taylor1 ? 
r².coeffs[1] : r²) + taylor_bound = eps(typeof(r²_constant))^(1/p) + use_taylor = r²_constant < taylor_bound + if use_taylor # taylor_bound # around zero, use taylor expansion in r² to avoid singularity in derivative y = one(r²) r²i = r² for i in 1:p # iterating over r²^i allows for automatic differentiation (even though for a regular float, it would all be zero) @@ -195,16 +193,19 @@ end # it is a valid stationary kernel, # because it is the inverse Fourier transform of point measure at μ (delta distribution) struct CosineKernel{T, V<:Union{T, AbstractVector{T}}} <: StationaryKernel{T} - μ::V + c::V end const Cosine = CosineKernel @functor Cosine # IDEA: trig-identity -> low-rank gramian # NOTE: this is the only stationary non-isotropic kernel so far -(k::CosineKernel)(τ) = cos(2π * dot(k.μ, τ)) -(k::CosineKernel)(x, y) = k(difference(x, y)) -(k::CosineKernel{<:Real, <:Real})(τ) = cos(2π * k.μ * sum(τ)) +input_trait(::CosineKernel) = StationaryLinearFunctionalInput() # dependent on dot(c, τ) +(k::CosineKernel)(c_dot_τ::Real) = cos(2π * c_dot_τ) +function (k::CosineKernel)(x, y) + τ = difference(x, y) + k(dot(k.c, τ)) +end ####################### spectral mixture kernel ################################ # can be seen as product kernel of Constant, Cosine, ExponentiatedQuadratic diff --git a/src/taylor.jl b/src/taylor.jl index 49bffd0..b180731 100644 --- a/src/taylor.jl +++ b/src/taylor.jl @@ -5,8 +5,7 @@ # Generally, taylor! is ~2x faster and slightly more accurate than splitting_barneshut! # when a signficiant degree of compression is taking place (i.e larger θ) function taylor!(b::AbstractVector, F::BarnesHutFactorization{<:Number}, w::AbstractVector, - α::Number = 1, β::Number = 0, θ::Real = F.θ; - split::Bool = true, use_com::Bool = true) + α::Number = 1, β::Number = 0, θ::Real = F.θ; use_com::Bool = true) size(F, 2) == length(w) || throw(DimensionMismatch("length of w does not match second dimension of F: $(length(w)) ≠ $(size(F, 2))")) eltype(b) == promote_type(eltype(F), eltype(w)) || throw(TypeError("eltype of target vector b not equal to eltype of matrix-vector product: $(eltype(b)) and $(promote_type(eltype(F), eltype(w)))")) @@ -18,7 +17,7 @@ function taylor!(b::AbstractVector, F::BarnesHutFactorization{<:Number}, w::Abst f_orders(r²) = value_derivative(F.k, r²) sums_w = node_sums(w, F.Tree) # IDEA: could pre-allocate - sums_w_r = weighted_node_sums(F.y, w, F.Tree) + sums_w_r = weighted_node_sums(w, F.y, F.Tree) centers = use_com ? compute_centers_of_mass(F, w) : get_hyper_centers(F) @. sums_w_r -= sums_w * centers # need to center the moments @threads for i in eachindex(F.x) # exactly 4 * length(y) allocations? 
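# --- illustrative aside (not part of the patch) --------------------------------
# Minimal sketch of what the lazy IsotropicGradientKernelElement introduced in
# src/gradient.jl above computes: for an isotropic kernel k(x, y) = f(|x - y|²)
# with r = x - y, k1 = f'(r²), k2 = f''(r²), the gradient-kernel block is
#     ∂x ∂y' k(x, y) = -2 k1 * I - 4 k2 * r * r',
# so mul! applies it matrix-free as b = -2 * (k1 * a + 2 * k2 * r * (r' * a)),
# matching the Woodbury form constructed above. The names used here (EQ,
# GradientKernel, Matrix on the lazy element) are taken from this diff; the
# snippet is a hedged usage sketch, not part of the package or its tests.
using LinearAlgebra
using CovarianceFunctions: EQ, GradientKernel

d = 3
x, y = randn(d), randn(d)
K = GradientKernel(EQ())(x, y)   # lazy d × d element, no dense block allocated
a = randn(d)
@assert K * a ≈ Matrix(K) * a    # lazy application agrees with the densified block
# gramian(GradientKernel(EQ()), X) assembles these elements into a lazy block factorization
# --------------------------------------------------------------------------------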
diff --git a/test/gradient.jl b/test/gradient.jl index 63a8e4b..7ea3860 100644 --- a/test/gradient.jl +++ b/test/gradient.jl @@ -6,20 +6,19 @@ using BlockFactorizations using CovarianceFunctions using CovarianceFunctions: EQ, RQ, Dot, ExponentialDot, NN, Matern, MaternP, Lengthscale, input_trait, GradientKernel, ValueGradientKernel, - DerivativeKernel, ValueDerivativeKernel, DerivativeKernelElement + DerivativeKernel, ValueDerivativeKernel, DerivativeKernelElement, Cosine const AbstractMatOrFac = Union{AbstractMatrix, Factorization} @testset "gradient kernels" begin # nd test - n = 5 - d = 7 - # n = 2 - # d = 2 + n = 2 + d = 5 X = randn(d, n) / sqrt(d) ε = 1e4eps() @testset "GradientKernel" begin - kernels = [EQ(), RQ(1.), MaternP(2), Matern(2.7), Dot()^3, ExponentialDot(), NN()] + kernels = [MaternP(3), Dot()^3, Cosine(randn(d)), NN()] + # kernels = [EQ(), RQ(1.), MaternP(2), Matern(2.7), Dot()^3, ExponentialDot(), NN(), Cosine(randn(d))] # kernels = [EQ(), Lengthscale(EQ(), 0.1)] # at this time, test doesn't work because fallback is incorrect a = randn(d*n) b = randn(d*n) @@ -37,22 +36,23 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization} # create anonymous wrapper around kernel to trigger generic fallback k2 = (x, y)->k(x, y) G2 = GradientKernel(k2) - # G2(x[1], x[2]) # this takes centuries to precompile, maybe because of nested differentiation? + + # this takes centuries to precompile, maybe because of nested differentiation? # not really important because it is a fallback anyway - K2 = CovarianceFunctions.gramian(G2, X) + K2 = CovarianceFunctions.gramian(G2, X) # NOTE: this is the bottleneck in test suite MK2 = Matrix(K2) @test MK ≈ MK2 # compare generic and specialized evaluation of gradient kernel # testing matrix multiply Kab = deepcopy(b) - α, β = randn(2) # trying out two-argument mul + α, β = randn(2) # trying out two-argument mul! mul!(Kab, K, a, α, β) # saves some memory, Woodbury still allocates a lot MKab = @. α * $*(MK, a) + β * b @test Kab ≈ MKab end # testing matrix solve - for k in [EQ(), RQ(1.)] # TODO: better conditioned NN kernel with bias + for k in kernels # TODO: better conditioned NN kernel with bias G = GradientKernel(k) K = CovarianceFunctions.gramian(G, X) a = randn(n*d) @@ -64,7 +64,7 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization} # TODO: @testset "ValueGradientKernel" begin - kernels = [Dot(), Dot()^3] #[EQ(), RQ(1.)]#, Dot(), Dot()^3, NN()] + kernels = [MaternP(3), Dot()^3, Cosine(randn(d))] a = randn((d+1)*n) b = randn((d+1)*n) for k in kernels @@ -85,12 +85,16 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization} # create anonymous wrapper around kernel to trigger generic fallback k2 = (x, y)->k(x, y) G2 = ValueGradientKernel(k2) - # G2(x[1], x[2]) # this takes centuries to precompile, maybe because of nested differentiation? 
- # not really important because it is a fallback anyway + K2 = CovarianceFunctions.gramian(G2, X) # NOTE: compilation here is super slow (not important for real application since it is a fallback) MK2 = Matrix(K2) @test MK ≈ MK2 # compare generic and specialized evaluation of gradient kernel + # instead, compare to already tested GradientKernel + # K12 = K.A[1, 2] + # @test K12 isa DerivativeKernelElement + # @test K12.value_value ≈ k(X[:, 1], X[:, 2]) + # @test Matrix(K12.gradient_gradient) ≈ Matrix(GradientKernel(k)(X[:, 1], X[:, 2])) + # testing matrix multiply Kab = deepcopy(b) α, β = randn(2) # trying out two-argument mul diff --git a/test/gradient_algebra.jl b/test/gradient_algebra.jl index 23187ff..b69e29a 100644 --- a/test/gradient_algebra.jl +++ b/test/gradient_algebra.jl @@ -6,7 +6,7 @@ using BlockFactorizations using CovarianceFunctions using CovarianceFunctions: EQ, RQ, Dot, ExponentialDot, NN, GradientKernel, ValueGradientKernel, DerivativeKernel, ValueDerivativeKernel, input_trait, - LazyMatrixSum + LazyMatrixSum, DotProductGradientKernelElement const AbstractMatOrFac = Union{AbstractMatrix, Factorization} @@ -43,7 +43,7 @@ end K, Kk, Kh = gramian(G, X), gramian(gk, X), gramian(gh, X) MK, Mkk, Mkh = Matrix(K), Matrix(Kk), Matrix(Kh) @test K isa BlockFactorization - @test K.A[1, 1] isa Woodbury # this means it correctly consolidated the sum kernel into one dot product kernel + @test K.A[1, 1] isa DotProductGradientKernelElement # this means it correctly consolidated the sum kernel into one dot product kernel @test MK ≈ Mkk + Mkh # sum of heterogeneous kernel types
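# --- illustrative aside (not part of the patch) --------------------------------
# Usage sketch for the renamed CosineKernel field (μ -> c) and the new
# StationaryLinearFunctionalInput trait exercised by the tests above:
# k(x, y) = cos(2π * dot(c, x - y)) depends only on the scalar dot(c, x - y), so its
# gradient-kernel blocks are rank one and are applied lazily by
# LinearFunctionalGradientKernelElement. All names below are taken from this diff;
# this is a hedged example, not part of the test suite.
using LinearAlgebra
using CovarianceFunctions
using CovarianceFunctions: Cosine, GradientKernel, input_trait

d, n = 3, 4
c = randn(d)
k = Cosine(c)                   # k(x, y) = cos(2π * dot(c, x - y))
@assert input_trait(k) isa CovarianceFunctions.StationaryLinearFunctionalInput

G = GradientKernel(k)
X = randn(d, n) / sqrt(d)
K = gramian(G, X)               # lazy (d*n) × (d*n) block gramian
a, b = randn(d * n), randn(d * n)
α, β = randn(2)
Kab = deepcopy(b)
mul!(Kab, K, a, α, β)           # matrix-free multiply, mirroring the tests above
@assert Kab ≈ α * (Matrix(K) * a) + β * b
# --------------------------------------------------------------------------------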