diff --git a/Manifest.toml b/Manifest.toml index 4308840..02641db 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -38,9 +38,9 @@ version = "5.0.7" [[deps.ArrayLayouts]] deps = ["FillArrays", "LinearAlgebra", "SparseArrays"] -git-tree-sha1 = "8b921542ad44cba67f1487e2226446597e0a90af" +git-tree-sha1 = "c23473c60476e62579c077534b9643ec400f792b" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" -version = "0.8.5" +version = "0.8.6" [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -176,9 +176,9 @@ version = "1.0.3" [[deps.DiffRules]] deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "dd933c4ef7b4c270aacd4eb88fa64c147492acf0" +git-tree-sha1 = "28d605d9a0ac17118fe2c5e9ce0fbb76c3ceb120" uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.10.0" +version = "1.11.0" [[deps.Distances]] deps = ["LinearAlgebra", "SparseArrays", "Statistics", "StatsAPI"] @@ -237,9 +237,9 @@ version = "0.1.1" [[deps.ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "LogExpFunctions", "NaNMath", "Preferences", "Printf", "Random", "SpecialFunctions", "StaticArrays"] -git-tree-sha1 = "40d1546a45abd63490569695a86a2d93c2021e54" +git-tree-sha1 = "34e6147e7686a101c245f12dba43b743c7afda96" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.26" +version = "0.10.27" [[deps.FunctionWrappers]] git-tree-sha1 = "241552bc2209f0fa068b6415b1942cc0aa486bcc" @@ -396,9 +396,9 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[deps.LogExpFunctions]] deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "a970d55c2ad8084ca317a4658ba6ce99b7523571" +git-tree-sha1 = "44a7b7bb7dd1afe12bac119df6a7e540fa2c96bc" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.12" +version = "0.3.13" [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" @@ -410,9 +410,9 @@ version = "0.4.10" [[deps.MLUtils]] deps = ["ChainRulesCore", "DelimitedFiles", "FLoops", "FoldsThreads", "Random", "ShowCases", "Statistics", "StatsBase"] -git-tree-sha1 = "32eeb46fa393ae36a4127c9442ade478c8d01117" +git-tree-sha1 = "202617a5a49a8b5f3b4abf96621f2519b1592c74" uuid = "f1d291b0-491e-4a28-83b9-f70985020b54" -version = "0.2.3" +version = "0.2.4" [[deps.MPC_jll]] deps = ["Artifacts", "GMP_jll", "JLLWrappers", "Libdl", "MPFR_jll", "Pkg"] @@ -533,9 +533,9 @@ version = "1.4.1" [[deps.Parsers]] deps = ["Dates"] -git-tree-sha1 = "3b429f37de37f1fc603cc1de4a799dc7fbe4c0b6" +git-tree-sha1 = "1285416549ccfcdf0c50d4997a94331e88d68413" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.3.0" +version = "2.3.1" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] @@ -658,9 +658,9 @@ version = "0.1.14" [[deps.Static]] deps = ["IfElse"] -git-tree-sha1 = "2114b1d8517764a8c4625a2e97f40640c7a301a7" +git-tree-sha1 = "b1f1f60bf4f25d8b374480fb78c7b9785edf95fd" uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" -version = "0.6.1" +version = "0.6.2" [[deps.StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] diff --git a/Project.toml b/Project.toml index f4b21a1..7565bfc 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,9 @@ BesselK = "0.3" BlockFactorizations = "1.2.1" DiffResults = "1.0" FillArrays = "0.12, 0.13" +Flux = "0.13" ForwardDiff = "0.10" +Functors = "0.2" IterativeSolvers = "0.9" 
KroneckerProducts = "1.0" LazyArrays = "0.22" diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/src/.DS_Store and /dev/null differ diff --git a/src/barneshut.jl b/src/barneshut.jl index 0e66749..e46e0d3 100644 --- a/src/barneshut.jl +++ b/src/barneshut.jl @@ -30,7 +30,7 @@ function BarnesHutFactorization(k, x, y = x, D = nothing; θ::Real = 1/4, leafsi # w = zeros(length(m)) # i = zeros(Bool, m) # WT, BT = typeof(w), typeof(i) - T = gramian_eltype(k, xs, ys) + T = gramian_eltype(k, xs[1], ys[1]) BarnesHutFactorization{T, KT, XT, YT, TT, DT, RT}(k, xs, ys, Tree, D, θ) #, w, i) end function BarnesHutFactorization(G::Gramian, θ::Real = 1/2; leafsize::Int = BARNES_HUT_DEFAULT_LEAFSIZE) @@ -49,7 +49,7 @@ function LinearAlgebra.mul!(y::AbstractVector, F::BarnesHutFactorization, x::Abs taylor!(y, F, x, α, β) end end -function Base.:*(F::BarnesHutFactorization, x::AbstractVector) +function Base.:*(F::BarnesHutFactorization{<:Number}, x::AbstractVector{<:Number}) T = promote_type(eltype(F), eltype(x)) y = zeros(T, size(F, 1)) mul!(y, F, x) @@ -148,45 +148,45 @@ end ############################# centers of mass ################################## # this is a weighted sum, could be generalized to incorporate node_sums -function compute_centers_of_mass(x::AbstractVector, w::AbstractVector, T::BallTree) +function compute_centers_of_mass(w::AbstractVector, x::AbstractVector, T::BallTree) D = eltype(x) <: StaticVector ? length(eltype(x)) : length(x[1]) # if x is static vector com = [zero(MVector{D, Float64}) for _ in 1:length(T.hyper_spheres)] - compute_centers_of_mass!(com, x, w, T) + compute_centers_of_mass!(com, w, x, T) end function compute_centers_of_mass(F::BarnesHutFactorization, w::AbstractVector) - compute_centers_of_mass(F.y, w, F.Tree) + compute_centers_of_mass(w, F.y, F.Tree) end -function compute_centers_of_mass!(com::AbstractVector, x::AbstractVector, w::AbstractVector, T::BallTree) +function compute_centers_of_mass!(com::AbstractVector, w::AbstractVector, x::AbstractVector, T::BallTree) abs_w = abs.(w) - weighted_node_sums!(com, x, abs_w, T) + weighted_node_sums!(com, abs_w, x, T) sum_w = node_sums(abs_w, T) ε = eps(eltype(w)) # ensuring division by zero it not a problem @. 
com ./= sum_w + ε end -node_sums(x::AbstractVector, T::BallTree) = weighted_node_sums(x, Ones(length(x)), T) +node_sums(x::AbstractVector, T::BallTree) = weighted_node_sums(Ones(length(x)), x, T) function node_sums!(sums, x::AbstractVector, T::BallTree) - weighted_node_sums!(sums, x, Ones(length(x)), T) + weighted_node_sums!(sums, Ones(length(x)), x, T) end -function weighted_node_sums(x::AbstractVector, w::AbstractVector, T::BallTree, index::Int = 1) +function weighted_node_sums(w::AbstractVector, x::AbstractVector, T::BallTree, index::Int = 1) length(x) == 0 && return zero(eltype(x)) - sums = zeros(typeof(w[1]'x[1]), length(T.hyper_spheres)) - weighted_node_sums!(sums, x, w, T) + sums = fill(zero(w[1]'x[1]), length(T.hyper_spheres)) + weighted_node_sums!(sums, w, x, T) end # NOTE: x should either be vector of numbers or vector of static arrays -function weighted_node_sums!(sums::AbstractVector, x::AbstractVector, - w::AbstractVector{<:Number}, T::BallTree, index::Int = 1) +function weighted_node_sums!(sums::AbstractVector, w::AbstractVector, + x::AbstractVector, T::BallTree, index::Int = 1) if isleaf(T.tree_data.n_internal_nodes, index) i = get_leaf_range(T.tree_data, index) wi, xi = @views w[T.indices[i]], x[T.indices[i]] sums[index] = wi'xi else - task = @spawn weighted_node_sums!(sums, x, w, T, getleft(index)) - weighted_node_sums!(sums, x, w, T, getright(index)) + task = @spawn weighted_node_sums!(sums, w, x, T, getleft(index)) + weighted_node_sums!(sums, w, x, T, getright(index)) wait(task) sums[index] = sums[getleft(index)] + sums[getright(index)] end diff --git a/src/gradient.jl b/src/gradient.jl index ad88caf..fe83333 100644 --- a/src/gradient.jl +++ b/src/gradient.jl @@ -11,7 +11,7 @@ struct GradientKernel{T, K, IT<:InputTrait} <: MultiKernel{T} end GradientKernel(k::AbstractKernel{T}) where T = GradientKernel{T, typeof(k)}(k) GradientKernel(k) = GradientKernel{Float64, typeof(k)}(k) -input_trait(G::GradientKernel) = G.input_trait +@inline input_trait(G::GradientKernel) = G.input_trait # gramian_eltype(G::GradientKernel) = eltype(G.k) Base.eltype(G::GradientKernel) = eltype(G.k) @@ -20,15 +20,15 @@ function elsize(G::Gramian{<:AbstractMatrix, <:GradientKernel}, i::Int) end function (G::GradientKernel)(x::AbstractVector, y::AbstractVector) - gradient_kernel(G.k, x, y, input_trait(G.k)) + gradient_kernel(G.k, x, y, input_trait(G)) end # need this to work with BlockFactorizations, see blockmul! in gramian -function evaluate_block!(K::AbstractMatOrFac, G::GradientKernel, - x::AbstractVector, y::AbstractVector, T::InputTrait = input_trait(G.k)) +function evaluate_block!(K, G::GradientKernel, x::AbstractVector, y::AbstractVector, T::InputTrait = input_trait(G.k)) gradient_kernel!(K, G.k, x, y, T) end +# necessary? function gradient_kernel(k, x::AbstractVector, y::AbstractVector, T::InputTrait) #::GenericInput = GenericInput()) K = allocate_gradient_kernel(k, x, y, T) gradient_kernel!(K, k, x, y, T) @@ -42,7 +42,7 @@ function allocate_gradient_kernel(k, x, y, ::GenericInput) end function gradient_kernel!(K::AbstractMatrix, k, x::AbstractVector, y::AbstractVector, ::GenericInput) - value = similar(x) + value = zeros(eltype(K), length(x)) # necessary? derivs = K # jacobian (higher order terms) result = DiffResults.DiffResult(value, K) g(y) = ForwardDiff.gradient(z->k(z, y), x) @@ -50,51 +50,144 @@ function gradient_kernel!(K::AbstractMatrix, k, x::AbstractVector, y::AbstractVe return K end -function allocate_gradient_kernel(k, x, y, T::DotProductInput) - u = y isa SVector ? 
MVector(y) : zero(y) - U = reshape(u, :, 1) - v = x isa SVector ? MVector(x) : zero(x) - V = reshape(v, :, 1)' - a = x isa SVector ? MVector(x) : zero(x) - A = Diagonal(a) - C = MMatrix{1, 1}(k(x, y)) - W = Woodbury(A, U, C, V) # needs to be separate from IsotropicInput, +# specialized lazy representation of element of gradient kernel gramian +# increases performance dramatically, compared to generic Woodbury implementation +struct GradientKernelElement{T, K, X, Y, IT} <: Factorization{T} + k::K + x::X + y::Y + input_trait::IT end -# NOTE: assumes K is allocated with allocate_gradient_kernel -function gradient_kernel!(K::Woodbury, k, x::AbstractVector, y::AbstractVector, ::DotProductInput) - d² = dot(x, y) - k1, k2 = derivative_laplacian(k, d²) - @. K.A.diag = k1 - @. K.U = y - @. K.C = k2 - @. K.V = x' - return K +Base.size(K::GradientKernelElement) = (length(K.x), length(K.y)) +Base.size(K::GradientKernelElement, i::Int) = 1 ≤ i ≤ 2 ? size(K)[i] : 1 +Base.eltype(K::GradientKernelElement{T}) where T = T +# gradient kernel element only used for sparsely representable elements +# Base.Matrix(K::GradientKernelElement) = Matrix(Woodbury(K)) +Base.Matrix(K::GradientKernelElement) = K * I(size(K, 1)) + +function Base.:*(G::GradientKernelElement, a) + T = promote_type(eltype(G), eltype(a)) + b = zeros(T, size(a)) + mul!(b, G, a) end -function allocate_gradient_kernel(k, x, y, T::IsotropicInput) - r = reshape(x - y, :, 1) # do these work without reshape? - if r isa SVector - r = MVector(r) - end - kxy = k(x, y) +const GenericGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, <:GenericInput} + +const IsotropicGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, IsotropicInput} +function IsotropicGradientKernelElement{T}(k, x, y) where T + IsotropicGradientKernelElement{T, typeof(k), typeof(x), typeof(y)}(k, x, y, IsotropicInput()) +end + +# isotropic kernel +function LinearAlgebra.mul!(b, G::IsotropicGradientKernelElement, a, α::Number = 1, β::Number = 0) #, ::IsotropicInput = G.input_trait) + r = difference(G.x, G.y) # other implementation appears to be faster for d = 128 since r does not have to be computed twice? + r² = sum(abs2, r) + k1, k2 = derivative_laplacian(G.k, r²) # IDEA: these could be computed once and stored in G + dot_r_a = r'a # dot(r, a) + @. b = α * -2(k1 * a + 2*k2 * r * dot_r_a) + β * b +end + +# sparse but not completely lazy representation +function WoodburyFactorizations.Woodbury(K::IsotropicGradientKernelElement) + k, x, y = K.k, K.x, K.y + r = x - y + r = reshape(r, :, 1) # do these work without reshape? + r² = sum(abs2, r) d = length(x) - # IDEA: have mutable Fill, since this is all we need, will save further allocations, check mutable_fill.jl in doodles - # D = Diagonal(MVector{d, typeof(kxy)}(undef)) # not using Fill here, because value can't be changed - D = Diagonal(zeros(typeof(kxy), d)) # not using Fill here, because value can't be changed - C = MMatrix{1, 1}(kxy) - K = Woodbury(D, r, C, r') + k1, k2 = derivative_laplacian(k, r²) + D = (-2k1*I)(d) + C = MMatrix{1, 1}(-4k2) + return K = Woodbury(D, r, C, r') end -# NOTE: assumes K is allocated with allocate_gradient_kernel -function gradient_kernel!(K::Woodbury, k, x::AbstractVector, y::AbstractVector, ::IsotropicInput) - r = K.U - @. 
r = x - y - d² = sum(abs2, r) +function allocate_gradient_kernel(k, x, y, ::IsotropicInput) + T = gramian_eltype(k, x, y) + IsotropicGradientKernelElement{T}(k, x, y) +end + +function gradient_kernel!(K::IsotropicGradientKernelElement, k, x::AbstractVector, y::AbstractVector, ::IsotropicInput) + typeof(K)(k, x, y, IsotropicInput()) +end + +function gradient_kernel(k, x::AbstractVector, y::AbstractVector, ::IsotropicInput) + allocate_gradient_kernel(k, x, y, IsotropicInput()) +end + +const DotProductGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, DotProductInput} +function DotProductGradientKernelElement{T}(k, x, y) where T + DotProductGradientKernelElement{T, typeof(k), typeof(x), typeof(y)}(k, x, y, DotProductInput()) +end +function LinearAlgebra.mul!(b, K::DotProductGradientKernelElement, a, α::Number = 1, β::Number = 0) + k, x, y = K.k, K.x, K.y + d² = dot(x, y) k1, k2 = derivative_laplacian(k, d²) - @. K.A.diag = -2k1 - K.C[1, 1] = -4k2 - return K + dot_x_a = x'a # dot(x, a) + @. b = α * (k1 * a + k2 * y * dot_x_a) + β * b +end + +# sparse but not completely lazy representation +function WoodburyFactorizations.Woodbury(K::DotProductGradientKernelElement) + k, x, y = K.k, K.x, K.y + d² = dot(x, y) + k1, k2 = derivative_laplacian(k, d²) + D = (k1*I)(length(x)) + C = MMatrix{1, 1}(k2) + return K = Woodbury(D, copy(y), C, copy(x)') +end + +function allocate_gradient_kernel(k, x, y, T::DotProductInput) + T = gramian_eltype(k, x, y) + DotProductGradientKernelElement{T}(k, x, y) +end + +function gradient_kernel!(K::DotProductGradientKernelElement, k, x::AbstractVector, y::AbstractVector, ::DotProductInput) + typeof(K)(k, x, y, DotProductInput()) +end + +function gradient_kernel(k, x::AbstractVector, y::AbstractVector, ::DotProductInput) + allocate_gradient_kernel(k, x, y, DotProductInput()) +end + + +const LinearFunctionalGradientKernelElement{T, K, X, Y} = GradientKernelElement{T, K, X, Y, StationaryLinearFunctionalInput} +function LinearFunctionalGradientKernelElement{T}(k, x, y) where T + LinearFunctionalGradientKernelElement{T, typeof(k), typeof(x), typeof(y)}(k, x, y, StationaryLinearFunctionalInput()) +end + +function LinearAlgebra.mul!(b, K::LinearFunctionalGradientKernelElement, a, α::Number = 1, β::Number = 0) + k, x, y = K.k, K.x, K.y + r = difference(x, y) + cr = dot(k.c, r) + k1, k2 = derivative_laplacian(k, cr) + dot_c_a = k.c'a # dot(c, a) + @. b = α * -k2 * k.c * dot_c_a + β * b +end + +# Base.Matrix(K::LinearFunctionalGradientKernelElement) = Matrix(LazyMatrixProduct(K)) # or more generally: K * I(size(K, 1)) +# sparse but not completely lazy representation +function LazyMatrixProduct(K::LinearFunctionalGradientKernelElement) + k, x, y = K.k, K.x, K.y + r = difference(x, y) + cr = dot(k.c, r) + k1, k2 = derivative_laplacian(k, cr) + c = zeros(eltype(k.c), length(x)) + @. 
c = k.c + c2 = -k2*c + return LazyMatrixProduct(c, c2') +end + +function allocate_gradient_kernel(k, x, y, T::StationaryLinearFunctionalInput) + T = gramian_eltype(k, x, y) + LinearFunctionalGradientKernelElement{T}(k, x, y) +end + +function gradient_kernel!(K::LinearFunctionalGradientKernelElement, k, x::AbstractVector, y::AbstractVector, ::StationaryLinearFunctionalInput) + typeof(K)(k, x, y, StationaryLinearFunctionalInput()) +end + +function gradient_kernel(k, x::AbstractVector, y::AbstractVector, ::StationaryLinearFunctionalInput) + allocate_gradient_kernel(k, x, y, StationaryLinearFunctionalInput()) end ############################# Constant Kernel ################################## @@ -372,7 +465,7 @@ function value_gradient_kernel!(K::DerivativeKernelElement, k, x::AbstractVector K.value_value = k(x, y) K.value_gradient .= ForwardDiff.gradient(z->k(x, z), y) # ForwardDiff.gradient!(r, z->k(z, y), x) K.gradient_value .= ForwardDiff.gradient(z->k(z, y), x) - gradient_kernel!(K.gradient_gradient, k, x, y, T) # call GradientKernel for component + K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T) return K end @@ -382,7 +475,7 @@ function value_gradient_kernel!(K::DerivativeKernelElement, k, x::AbstractVector K.value_value = k0 @. K.value_gradient = k1*x # ForwardDiff.gradient(z->k(x, z), y)' @. K.gradient_value = k1*y # ForwardDiff.gradient(z->k(z, y), x) - gradient_kernel!(K.gradient_gradient, k, x, y, T) # call GradientKernel for component + K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T) return K end @@ -395,7 +488,7 @@ function value_gradient_kernel!(K::DerivativeKernelElement, k, x::AbstractVector @. K.value_gradient = r @. K.value_gradient *= -2k1 # ForwardDiff.gradient(z->k(x, z), y)' @. K.gradient_value *= 2k1 # ForwardDiff.gradient(z->k(z, y), x) - gradient_kernel!(K.gradient_gradient, k, x, y, T) # call GradientKernel for component + K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T) return K end @@ -441,18 +534,21 @@ end ############################# Helpers ########################################## # computes value, first, and second derivatives of f at x +# this works even with nested differentiation function value_derivative_laplacian(f, x::Real) - f(x), derivative_laplacian(f, x)... + fx = f(x) + fx, derivative_laplacian(f, x, typeof(fx))... 
end # TODO: overwrite for MaternP -function derivative_laplacian(f, x::Real) +function derivative_laplacian(f, x::Real, T::DataType = typeof(x)) g(x) = ForwardDiff.derivative(f, x) - value_derivative(g, x) + value_derivative(g, x, T) end -# WARNING: do not use if nested differentation through value_derivative is happening -function value_derivative(f, x::Real) - r = DiffResults.DiffResult(zero(x), zero(x)) # this could take pre-allocated temporary storage +# NOTE: if nested differentiation through value_derivative is happening, +# the correct output type T = typeof(f(x)) needs to be passed +function value_derivative(f, x::Real, T::DataType = typeof(x)) + r = DiffResults.DiffResult(zero(T), zero(T)) # this could take pre-allocated temporary storage r = ForwardDiff.derivative!(r, f, x) r.value, r.derivs[1] end diff --git a/src/gradient_algebra.jl b/src/gradient_algebra.jl index 0bef882..5564096 100644 --- a/src/gradient_algebra.jl +++ b/src/gradient_algebra.jl @@ -3,13 +3,15 @@ # allocates space for gradient kernel evaluation but does not evaluate # the separation from evaluation is useful for ValueGradientKernel function allocate_gradient_kernel(k::Sum, x, y, ::GenericInput) - H = (allocate_gradient_kernel(h, x, y, input_trait(h)) for h in k.args) - LazyMatrixSum(H...) + H = [allocate_gradient_kernel(h, x, y, input_trait(h)) for h in k.args] + LazyMatrixSum(H) end +# NOTE: K should be allocated with allocate_gradient_kernel(k, x, y) function gradient_kernel!(K::LazyMatrixSum, k::Sum, x::AbstractVector, y::AbstractVector, ::GenericInput) - for (h, H) in zip(k.args, K.args) - gradient_kernel!(H, h, x, y, input_trait(h)) + for i in eachindex(k.args) + h = k.args[i] + K.args[i] = gradient_kernel!(K.args[i], h, x, y, input_trait(h)) end return K end @@ -40,7 +42,7 @@ function gradient_kernel!(W::Woodbury, k::Product, x::AbstractVector, y::Abstrac h, H = k.args[i], A.args[i] D = H.args[1] @. D.diag = k_j - gradient_kernel!(H.args[2], h, x, y, input_trait(h)) + H.args[2] = gradient_kernel!(H.args[2], h, x, y, input_trait(h)) end return W end @@ -136,7 +138,7 @@ function gradient_kernel!(W::Woodbury, k::VerticalRescaling, x, y, ::GenericInpu @. A.args[1].diag = fx H = A.args[2] # LazyMatrixProduct: first and third are the diagonal scaling matrices, second is the gradient_kernel_matrix of h @. A.args[3].diag = fy - gradient_kernel!(H, h, x, y, input_trait(h)) + A.args[2] = gradient_kernel!(H, h, x, y, input_trait(h)) ForwardDiff.gradient!(@view(W.U[:, 1]), f, x) ForwardDiff.gradient!(@view(W.U[:, 2]), z->h(z, y), x) ForwardDiff.gradient!(@view(W.V[1, :]), f, y) @@ -165,7 +167,7 @@ function gradient_kernel!(W::Woodbury, k::Chained, x, y, ::GenericInput) f1, f2 = derivative_laplacian(f, h(x, y)) @. A.args[1].diag = f1 H = A.args[2] # LazyMatrixProduct: first argument is diagonal scaling, second is the gradient_kernel_matrix of h - gradient_kernel!(H, h, x, y, input_trait(h)) + H.A.args[2] = gradient_kernel!(H, h, x, y, input_trait(h)) ForwardDiff.gradient!(@view(W.U[:]), z->h(z, y), x) ForwardDiff.gradient!(@view(W.V[1, :]), z->h(x, z), y) @.
W.C = f2 diff --git a/src/gramian.jl b/src/gramian.jl index 0cfa767..01c8a1c 100644 --- a/src/gramian.jl +++ b/src/gramian.jl @@ -17,7 +17,7 @@ end const BlockGramian{T, M, K, X, Y} = BlockFactorization{<:T, <:Gramian{M, K, X, Y}} function Gramian(k, x::AbstractVector, y::AbstractVector = x) - T = gramian_eltype(k, x, y) + T = gramian_eltype(k, x[1], y[1]) Gramian{T, typeof(k), typeof(x), typeof(y)}(k, x, y) end # with euclidean dot product @@ -31,8 +31,7 @@ elsize(G::Gramian{<:Number}) = () function gramian_eltype(k::MercerKernel, x, y) promote_type(eltype(k), eltype(eltype(x)), eltype(eltype(y))) end -gramian_eltype(k, x, y) = typeof(k(x[1], y[1])) # default to evaluation - +gramian_eltype(k, x, y) = typeof(k(x, y)) # default to evaluation # indexing # NOTE: @inline helps increase mvm performance by 50% @@ -226,6 +225,10 @@ function BlockFactorizations.blockmul!(y::AbstractVecOfVecOrMat, G::Gramian, x:: return y end +function LinearAlgebra.mul!(y::AbstractVecOfVecOrMat, G::Gramian, x::AbstractVecOfVecOrMat, α::Real = 1, β::Real = 0) + BlockFactorizations.blockmul!(y, G, x, α, β) +end + # IDEA: @inline? function evaluate_block!(Gij, G::Gramian, i::Int, j::Int, T = input_trait(G.k)) evaluate_block!(Gij, G.k, G.x[i], G.y[j], T) diff --git a/src/lazy_linear_algebra.jl b/src/lazy_linear_algebra.jl index b8680be..376ff5e 100644 --- a/src/lazy_linear_algebra.jl +++ b/src/lazy_linear_algebra.jl @@ -14,11 +14,12 @@ abstract type LazyFactorization{T} <: Factorization{T} end # in that it calculates intermediate results, important to make use of special structure # observation: this is remarkably similar to the KroneckerProducts implementation # allow AbstractMatOrFacOrUni? will need adjustment for size function -struct LazyMatrixProduct{T, AT<:Tuple{Vararg{AbstractMatOrFac}}, F} <: LazyFactorization{T} +struct LazyMatrixProduct{T, AT, F} <: LazyFactorization{T} args::AT tol::F # temporaries - function LazyMatrixProduct(args::Tuple; tol::Real = default_tol) + # args is vector or tuple of constituent matrices + function LazyMatrixProduct(args; tol::Real = default_tol) for i in 1:length(args)-1 size(args[i], 2) == size(args[i+1], 1) || throw(DimensionMismatch("$i")) end @@ -26,7 +27,9 @@ struct LazyMatrixProduct{T, AT<:Tuple{Vararg{AbstractMatOrFac}}, F} <: LazyFacto new{T, typeof(args), typeof(tol)}(args, tol) end end -LazyMatrixProduct(A::AbstractMatOrFac...; tol::Real = default_tol) = LazyMatrixProduct(A; tol = tol) +function LazyMatrixProduct(A::AbstractMatOrFac...; tol::Real = default_tol) + LazyMatrixProduct([A...]; tol = tol) +end function Base.size(L::LazyMatrixProduct, i::Int) if i == 1 @@ -80,16 +83,19 @@ end ################################################################################ # in contrast to the AppliedMatrix in LazyArrays, this is not completely lazy # in that it calculates intermediate results -struct LazyMatrixSum{T, AT<:Tuple{Vararg{AbstractMatOrFac}}, F} <: LazyFactorization{T} +# TODO: should we have a tol field? +struct LazyMatrixSum{T, AT, F} <: LazyFactorization{T} args::AT tol::F - function LazyMatrixSum(args::Tuple; tol::Real = default_tol) + function LazyMatrixSum(args; tol::Real = default_tol) all(==(size(args[1])), size.(args)) || throw(DimensionMismatch()) T = promote_type(eltype.(args)...) 
new{T, typeof(args), typeof(tol)}(args, tol) end end -LazyMatrixSum(A::AbstractMatOrFac...; tol::Real = default_tol) = LazyMatrixSum(A, tol = tol) +function LazyMatrixSum(A::AbstractMatOrFac...; tol::Real = default_tol) + LazyMatrixSum([A...], tol = tol) +end Base.size(L::LazyMatrixSum, i...) = size(L.args[1], i...) Base.adjoint(L::LazyMatrixSum) = LazyMatrixSum(adjoint.(L.args)) diff --git a/src/properties.jl b/src/properties.jl index 60cb918..b6c6a03 100644 --- a/src/properties.jl +++ b/src/properties.jl @@ -30,9 +30,10 @@ isdot(P::Power) = isdot(P.k) abstract type InputTrait end struct GenericInput <: InputTrait end -struct IsotropicInput <: InputTrait end -struct DotProductInput <: InputTrait end -struct StationaryInput <: InputTrait end +struct IsotropicInput <: InputTrait end # dependent on |r|² +struct DotProductInput <: InputTrait end # dependent on x ⋅ y +struct StationaryInput <: InputTrait end # dependent on r +struct StationaryLinearFunctionalInput <: InputTrait end # dependent on c ⋅ r struct PeriodicInput <: InputTrait end input_trait(::T) where T = GenericInput() diff --git a/src/stationary.jl b/src/stationary.jl index 86b724d..9dab7f7 100644 --- a/src/stationary.jl +++ b/src/stationary.jl @@ -34,7 +34,7 @@ end #################### standard exponentiated quadratic kernel ################### struct ExponentiatedQuadratic{T} <: IsotropicKernel{T} end -const EQ = ExponentiatedQuadratic +const EQ{T} = ExponentiatedQuadratic{T} @functor EQ EQ() = EQ{Union{}}() # defaults to "bottom" type since it doesn't have any parameters (k::EQ)(r²::Number) = exp(-r²/2) @@ -50,9 +50,6 @@ RQ(α::Real) = RQ{typeof(α)}(α) (k::RQ)(r²::Number) = (1 + r² / (2*k.α))^-k.α -parameters(k::RQ) = [k.α] -nparameters(::RQ) = 1 - ########################### exponential kernel ################################# struct Exponential{T} <: IsotropicKernel{T} end const Exp = Exponential @@ -133,10 +130,11 @@ MaternP(k::Matern) = MaternP(floor(Int, k.ν)) # project Matern to closest Mater (k::MaternP)(r²::Integer) = k(float(r²)) function (k::MaternP)(r²::Number) - r² < 0 && println(r²) p = k.p - taylor_bound = eps(typeof(r²))^(1/p) - if r² < taylor_bound # taylor_bound # around zero, use taylor expansion in r² to avoid singularity in derivative + r²_constant = (r² isa Taylor1 ? 
r².coeffs[1] : r²) + taylor_bound = eps(typeof(r²_constant))^(1/p) + use_taylor = r²_constant < taylor_bound + if use_taylor # taylor_bound # around zero, use taylor expansion in r² to avoid singularity in derivative y = one(r²) r²i = r² for i in 1:p # iterating over r²^i allows for automatic differentiation (even though for a regular float, it would all be zero) @@ -195,16 +193,19 @@ end # it is a valid stationary kernel, # because it is the inverse Fourier transform of point measure at μ (delta distribution) struct CosineKernel{T, V<:Union{T, AbstractVector{T}}} <: StationaryKernel{T} - μ::V + c::V end const Cosine = CosineKernel @functor Cosine # IDEA: trig-identity -> low-rank gramian # NOTE: this is the only stationary non-isotropic kernel so far -(k::CosineKernel)(τ) = cos(2π * dot(k.μ, τ)) -(k::CosineKernel)(x, y) = k(difference(x, y)) -(k::CosineKernel{<:Real, <:Real})(τ) = cos(2π * k.μ * sum(τ)) +input_trait(::CosineKernel) = StationaryLinearFunctionalInput() # dependent on dot(c, τ) +(k::CosineKernel)(c_dot_τ::Real) = cos(2π * c_dot_τ) +function (k::CosineKernel)(x, y) + τ = difference(x, y) + k(dot(k.c, τ)) +end ####################### spectral mixture kernel ################################ # can be seen as product kernel of Constant, Cosine, ExponentiatedQuadratic diff --git a/src/taylor.jl b/src/taylor.jl index 49bffd0..b180731 100644 --- a/src/taylor.jl +++ b/src/taylor.jl @@ -5,8 +5,7 @@ # Generally, taylor! is ~2x faster and slightly more accurate than splitting_barneshut! # when a signficiant degree of compression is taking place (i.e larger θ) function taylor!(b::AbstractVector, F::BarnesHutFactorization{<:Number}, w::AbstractVector, - α::Number = 1, β::Number = 0, θ::Real = F.θ; - split::Bool = true, use_com::Bool = true) + α::Number = 1, β::Number = 0, θ::Real = F.θ; use_com::Bool = true) size(F, 2) == length(w) || throw(DimensionMismatch("length of w does not match second dimension of F: $(length(w)) ≠ $(size(F, 2))")) eltype(b) == promote_type(eltype(F), eltype(w)) || throw(TypeError("eltype of target vector b not equal to eltype of matrix-vector product: $(eltype(b)) and $(promote_type(eltype(F), eltype(w)))")) @@ -18,7 +17,7 @@ function taylor!(b::AbstractVector, F::BarnesHutFactorization{<:Number}, w::Abst f_orders(r²) = value_derivative(F.k, r²) sums_w = node_sums(w, F.Tree) # IDEA: could pre-allocate - sums_w_r = weighted_node_sums(F.y, w, F.Tree) + sums_w_r = weighted_node_sums(w, F.y, F.Tree) centers = use_com ? compute_centers_of_mass(F, w) : get_hyper_centers(F) @. sums_w_r -= sums_w * centers # need to center the moments @threads for i in eachindex(F.x) # exactly 4 * length(y) allocations? 
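# --- illustrative aside (not part of the patch) --------------------------------
# Minimal sketch of what the lazy IsotropicGradientKernelElement introduced in
# src/gradient.jl above computes: for an isotropic kernel k(x, y) = f(|x - y|²)
# with r = x - y, k1 = f'(r²), k2 = f''(r²), the gradient-kernel block is
#     ∂x ∂y' k(x, y) = -2 k1 * I - 4 k2 * r * r',
# so mul! applies it matrix-free as b = -2 * (k1 * a + 2 * k2 * r * (r' * a)),
# matching the Woodbury form constructed above. The names used here (EQ,
# GradientKernel, Matrix on the lazy element) are taken from this diff; the
# snippet is a hedged usage sketch, not part of the package or its tests.
using LinearAlgebra
using CovarianceFunctions: EQ, GradientKernel

d = 3
x, y = randn(d), randn(d)
K = GradientKernel(EQ())(x, y)   # lazy d × d element, no dense block allocated
a = randn(d)
@assert K * a ≈ Matrix(K) * a    # lazy application agrees with the densified block
# gramian(GradientKernel(EQ()), X) assembles these elements into a lazy block factorization
# --------------------------------------------------------------------------------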
diff --git a/test/gradient.jl b/test/gradient.jl index 63a8e4b..7ea3860 100644 --- a/test/gradient.jl +++ b/test/gradient.jl @@ -6,20 +6,19 @@ using BlockFactorizations using CovarianceFunctions using CovarianceFunctions: EQ, RQ, Dot, ExponentialDot, NN, Matern, MaternP, Lengthscale, input_trait, GradientKernel, ValueGradientKernel, - DerivativeKernel, ValueDerivativeKernel, DerivativeKernelElement + DerivativeKernel, ValueDerivativeKernel, DerivativeKernelElement, Cosine const AbstractMatOrFac = Union{AbstractMatrix, Factorization} @testset "gradient kernels" begin # nd test - n = 5 - d = 7 - # n = 2 - # d = 2 + n = 2 + d = 5 X = randn(d, n) / sqrt(d) ε = 1e4eps() @testset "GradientKernel" begin - kernels = [EQ(), RQ(1.), MaternP(2), Matern(2.7), Dot()^3, ExponentialDot(), NN()] + kernels = [MaternP(3), Dot()^3, Cosine(randn(d)), NN()] + # kernels = [EQ(), RQ(1.), MaternP(2), Matern(2.7), Dot()^3, ExponentialDot(), NN(), Cosine(randn(d))] # kernels = [EQ(), Lengthscale(EQ(), 0.1)] # at this time, test doesn't work because fallback is incorrect a = randn(d*n) b = randn(d*n) @@ -37,22 +36,23 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization} # create anonymous wrapper around kernel to trigger generic fallback k2 = (x, y)->k(x, y) G2 = GradientKernel(k2) - # G2(x[1], x[2]) # this takes centuries to precompile, maybe because of nested differentiation? + + # this takes centuries to precompile, maybe because of nested differentiation? # not really important because it is a fallback anyway - K2 = CovarianceFunctions.gramian(G2, X) + K2 = CovarianceFunctions.gramian(G2, X) # NOTE: this is the bottleneck in test suite MK2 = Matrix(K2) @test MK ≈ MK2 # compare generic and specialized evaluation of gradient kernel # testing matrix multiply Kab = deepcopy(b) - α, β = randn(2) # trying out two-argument mul + α, β = randn(2) # trying out two-argument mul! mul!(Kab, K, a, α, β) # saves some memory, Woodbury still allocates a lot MKab = @. α * $*(MK, a) + β * b @test Kab ≈ MKab end # testing matrix solve - for k in [EQ(), RQ(1.)] # TODO: better conditioned NN kernel with bias + for k in kernels # TODO: better conditioned NN kernel with bias G = GradientKernel(k) K = CovarianceFunctions.gramian(G, X) a = randn(n*d) @@ -64,7 +64,7 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization} # TODO: @testset "ValueGradientKernel" begin - kernels = [Dot(), Dot()^3] #[EQ(), RQ(1.)]#, Dot(), Dot()^3, NN()] + kernels = [MaternP(3), Dot()^3, Cosine(randn(d))] a = randn((d+1)*n) b = randn((d+1)*n) for k in kernels @@ -85,12 +85,16 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization} # create anonymous wrapper around kernel to trigger generic fallback k2 = (x, y)->k(x, y) G2 = ValueGradientKernel(k2) - # G2(x[1], x[2]) # this takes centuries to precompile, maybe because of nested differentiation? 
- # not really important because it is a fallback anyway + K2 = CovarianceFunctions.gramian(G2, X) # NOTE: compilation here is super slow (not important for real application since it is a fallback) MK2 = Matrix(K2) @test MK ≈ MK2 # compare generic and specialized evaluation of gradient kernel + # instead, compare to already tested GradientKernel + # K12 = K.A[1, 2] + # @test K12 isa DerivativeKernelElement + # @test K12.value_value ≈ k(X[:, 1], X[:, 2]) + # @test Matrix(K12.gradient_gradient) ≈ Matrix(GradientKernel(k)(X[:, 1], X[:, 2])) + # testing matrix multiply Kab = deepcopy(b) α, β = randn(2) # trying out two-argument mul diff --git a/test/gradient_algebra.jl b/test/gradient_algebra.jl index 23187ff..b69e29a 100644 --- a/test/gradient_algebra.jl +++ b/test/gradient_algebra.jl @@ -6,7 +6,7 @@ using BlockFactorizations using CovarianceFunctions using CovarianceFunctions: EQ, RQ, Dot, ExponentialDot, NN, GradientKernel, ValueGradientKernel, DerivativeKernel, ValueDerivativeKernel, input_trait, - LazyMatrixSum + LazyMatrixSum, DotProductGradientKernelElement const AbstractMatOrFac = Union{AbstractMatrix, Factorization} @@ -43,7 +43,7 @@ end K, Kk, Kh = gramian(G, X), gramian(gk, X), gramian(gh, X) MK, Mkk, Mkh = Matrix(K), Matrix(Kk), Matrix(Kh) @test K isa BlockFactorization - @test K.A[1, 1] isa Woodbury # this means it correctly consolidated the sum kernel into one dot product kernel + @test K.A[1, 1] isa DotProductGradientKernelElement # this means it correctly consolidated the sum kernel into one dot product kernel @test MK ≈ Mkk + Mkh # sum of heterogeneous kernel types
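# --- illustrative aside (not part of the patch) --------------------------------
# Usage sketch for the renamed CosineKernel field (μ -> c) and the new
# StationaryLinearFunctionalInput trait exercised by the tests above:
# k(x, y) = cos(2π * dot(c, x - y)) depends only on the scalar dot(c, x - y), so its
# gradient-kernel blocks are rank one and are applied lazily by
# LinearFunctionalGradientKernelElement. All names below are taken from this diff;
# this is a hedged example, not part of the test suite.
using LinearAlgebra
using CovarianceFunctions
using CovarianceFunctions: Cosine, GradientKernel, input_trait

d, n = 3, 4
c = randn(d)
k = Cosine(c)                   # k(x, y) = cos(2π * dot(c, x - y))
@assert input_trait(k) isa CovarianceFunctions.StationaryLinearFunctionalInput

G = GradientKernel(k)
X = randn(d, n) / sqrt(d)
K = gramian(G, X)               # lazy (d*n) × (d*n) block gramian
a, b = randn(d * n), randn(d * n)
α, β = randn(2)
Kab = deepcopy(b)
mul!(Kab, K, a, α, β)           # matrix-free multiply, mirroring the tests above
@assert Kab ≈ α * (Matrix(K) * a) + β * b
# --------------------------------------------------------------------------------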