diff --git a/source/dcv/core/algorithm.d b/source/dcv/core/algorithm.d
index 378701c..f948d28 100644
--- a/source/dcv/core/algorithm.d
+++ b/source/dcv/core/algorithm.d
@@ -25,19 +25,21 @@ import std.traits : isNumeric, isFloatingPoint;
 import mir.ndslice.slice : Slice, sliced, DeepElementType;
 import mir.ndslice.algorithm : ndReduce, ndEach, Yes;
 
+import dcv.core.utils : isSlice;
+
 version(LDC)
 {
-    import ldc.intrinsics : sqrt = llvm_sqrt, fabs = llvm_fabs, min = llvm_minnum, max = llvm_maxnum;
+    import ldc.intrinsics : sqrt = llvm_sqrt, fabs = llvm_fabs;
 }
 else
 {
     import std.math : sqrt, fabs;
-    import std.algorithm.comparison : min, max;
 }
 
 version(unittest)
 {
     import std.math : approxEqual;
+    import ldc.intrinsics : min = llvm_minnum, max = llvm_maxnum;
 }
 
 /**
@@ -170,10 +172,14 @@ Returns:
     Scaled input tensor.
     
 */
-@nogc nothrow auto scaled(Scalar, Range, size_t N)(auto ref Slice!(N, Range) tensor, Scalar alpha = 1, Scalar beta = 0)
-        if (isNumeric!Scalar)
+nothrow @nogc auto scaled(Scalar, Tensor)(Tensor tensor, Scalar alpha = 1, Scalar beta = 0) if (isNumeric!Scalar)
+in
+{
+    static assert(isSlice!Tensor, "Input tensor has to be of type mir.ndslice.slice.Slice.");
+}
+body
 {
-    tensor.ndEach!((ref v) { v = alpha * (v) + beta; }, Yes.vectorized);
+    tensor.ndEach!((ref v) { v = cast(DeepElementType!Tensor)(alpha * (v) + beta); }, Yes.vectorized);
     return tensor;
 }
 
@@ -198,21 +204,32 @@ Params:
     maxValue = Maximal value output tensor should contain.
 
 */
-@nogc auto ranged(Scalar, Range, size_t N)(auto ref Slice!(N, Range) tensor,
+nothrow @nogc auto ranged(Scalar, Tensor)(Tensor tensor,
         Scalar minValue = 0, Scalar maxValue = 1) if (isNumeric!Scalar)
+in
+{
+    static assert(isSlice!Tensor, "Input tensor has to be of type mir.ndslice.slice.Slice.");
+}
+body
 {
-    alias RangeElementType = DeepElementType!(typeof(tensor));
+    alias T = DeepElementType!Tensor;
 
-    auto _min = ndReduce!(min, Yes.vectorized)(RangeElementType.max, tensor);
-    static if (isFloatingPoint!RangeElementType)
-        auto _max = ndReduce!(max, Yes.vectorized)(RangeElementType.min_normal, tensor);
+    static if (isFloatingPoint!T)
+    {
+        import ldc.intrinsics : min = llvm_minnum, max = llvm_maxnum;
+        auto _max = ndReduce!(max, Yes.vectorized)(T.min_normal, tensor);
+    }
     else
-        auto _max = ndReduce!(max, Yes.vectorized)(RangeElementType.min, tensor);
+    {
+        import std.algorithm.comparison : min, max;
+        auto _max = ndReduce!(max, Yes.vectorized)(T.min, tensor);
+    }
 
+    auto _min = ndReduce!(min, Yes.vectorized)(T.max, tensor);
     auto rn_val = _max - _min;
     auto sc_val = maxValue - minValue;
 
-    tensor.ndEach!((ref a) { a = sc_val * ((a - _min) / rn_val) + minValue; }, Yes.vectorized);
+    tensor.ndEach!((ref a) { a = cast(T)(sc_val * ((a - _min) / rn_val) + minValue); }, Yes.vectorized);
 
     return tensor;
 }
diff --git a/source/dcv/core/utils.d b/source/dcv/core/utils.d
index eed7e36..291f3a4 100644
--- a/source/dcv/core/utils.d
+++ b/source/dcv/core/utils.d
@@ -23,6 +23,77 @@ static Slice!(N, V*) emptySlice(size_t N, V)() pure @safe nothrow
     return Slice!(N, V*)();
 }
 
+package(dcv) @nogc pure nothrow
+{
+    /**
+       Pack and unpack (N, T*) slices to (N-1, T[M]*).
+    */
+    auto staticPack(size_t CH, size_t N, T)(Slice!(N, T*) slice)
+    {
+        ulong[N-1] shape = slice.shape[0 .. N-1];
+        long[N-1] strides = [slice.structure.strides[0] / CH, slice.structure.strides[1] / CH];
+        T[CH]* ptr = cast(T[CH]*)slice.ptr;
+        return Slice!(N-1, T[CH]*)(shape, strides, ptr);
+    }
+
+    /// ditto
+    auto staticUnpack(size_t CH, size_t N, T)(Slice!(N, T[CH]*) slice)
+    {
+        ulong[N+1] shape = [slice.shape[0], slice.shape[1], CH];
+        long[N+1] strides = [slice.structure.strides[0] * CH, slice.structure.strides[1] * CH, 1];
+        T* ptr = cast(T*)slice.ptr;
+        return Slice!(N+1, T*)(shape, strides, ptr);
+    }
+
+    @safe @nogc nothrow pure auto borders(Shape)(Shape shape, size_t ks)
+    in
+    {
+        static assert(Shape.length == 2, "Non-matrix border extraction is not currently supported.");
+    }
+    body
+    {
+        import std.typecons : Tuple;
+        import std.algorithm.comparison : max;
+        import std.range : iota;
+        import std.traits : ReturnType;
+
+        struct Range(Iota)
+        {
+            Iota rows;
+            Iota cols;
+            @disable this();
+            this(Iota rows, Iota cols)
+            {
+                this.rows = rows;
+                this.cols = cols;
+            }
+        }
+
+        auto range(Iota)(Iota rows, Iota cols)
+        {
+            return Range!Iota(rows, cols);
+        }
+
+        // forced size_t cast...
+        auto _iota(B, E, S)(B begin, E end, S step = 1)
+        {
+            return iota(size_t(begin), size_t(end), size_t(step));
+        }
+
+        size_t kh = max(1, ks / 2);
+        alias Iota = ReturnType!(iota!(size_t, size_t, size_t));
+
+        Range!(Iota)[4] borders = [
+            range(_iota(0, shape[0]), _iota(0, kh)),
+            range(_iota(0, shape[0]), _iota(shape[1] - kh, shape[1])),
+            range(_iota(0, kh), _iota(0, shape[1])),
+            range(_iota(shape[0] - kh, shape[0]), _iota(0, shape[1]))
+        ];
+
+        return borders;
+    }
+}
+
 /**
 Take another typed Slice. Type conversion for the Slice object.
 
@@ -185,43 +256,39 @@ static if (__VERSION__ >= 2071)
 /// Check if given function can perform boundary condition test.
 template isBoundaryCondition(alias bc)
 {
-    import std.typetuple;
+    enum bool isBoundaryCondition = true;
+}
+
+nothrow @nogc pure
+{
 
-    alias Indices = TypeTuple!(int, int);
-    alias bct = bc!(2, float, Indices);
-    static if (isCallable!(bct) && is(Parameters!bct[0] == Slice!(2, float*))
-            && is(Parameters!bct[1] == int) && is(Parameters!bct[2] == int) && is(ReturnType!bct == float))
+    /// $(LINK2 https://en.wikipedia.org/wiki/Neumann_boundary_condition, Neumann) boundary condition test
+    auto neumann(Tensor, Indices...)(Tensor tensor, Indices indices)
+            if (isSlice!Tensor && allSameType!Indices && allSatisfy!(isIntegral, Indices))
     {
-        enum bool isBoundaryCondition = true;
+        immutable N = ReturnType!(Tensor.shape).length;
+        static assert(indices.length == N, "Invalid index dimension");
+        return tensor.bcImpl!_neumann(indices);
     }
-    else
+
+    /// $(LINK2 https://en.wikipedia.org/wiki/Periodic_boundary_conditions,Periodic) boundary condition test
+    auto periodic(Tensor, Indices...)(Tensor tensor, Indices indices)
+            if (isSlice!Tensor && allSameType!Indices && allSatisfy!(isIntegral, Indices))
     {
-        enum bool isBoundaryCondition = false;
+        immutable N = ReturnType!(Tensor.shape).length;
+        static assert(indices.length == N, "Invalid index dimension");
+        return tensor.bcImpl!_periodic(indices);
     }
-}
 
-/// $(LINK2 https://en.wikipedia.org/wiki/Neumann_boundary_condition, Neumann) boundary condition test
-ref T neumann(size_t N, T, Indices...)(ref Slice!(N, T*) slice, Indices indices)
-        if (allSameType!Indices && allSatisfy!(isIntegral, Indices))
-{
-    static assert(indices.length == N, "Invalid index dimension");
-    return slice.bcImpl!_neumann(indices);
-}
-
-/// $(LINK2 https://en.wikipedia.org/wiki/Periodic_boundary_conditions,Periodic) boundary condition test
-ref T periodic(size_t N, T, Indices...)(ref Slice!(N, T*) slice, Indices indices)
-        if (allSameType!Indices && allSatisfy!(isIntegral, Indices))
-{
-    static assert(indices.length == N, "Invalid index dimension");
-    return slice.bcImpl!_periodic(indices);
-}
+    /// Symmetric boundary condition test
+    auto symmetric(Tensor, Indices...)(Tensor tensor, Indices indices)
+            if (isSlice!Tensor && allSameType!Indices && allSatisfy!(isIntegral, Indices))
+    {
+        immutable N = ReturnType!(Tensor.shape).length;
+        static assert(indices.length == N, "Invalid index dimension");
+        return tensor.bcImpl!_symmetric(indices);
+    }
 
-/// Symmetric boundary condition test
-ref T symmetric(size_t N, T, Indices...)(ref Slice!(N, T*) slice, Indices indices)
-        if (allSameType!Indices && allSatisfy!(isIntegral, Indices))
-{
-    static assert(indices.length == N, "Invalid index dimension");
-    return slice.bcImpl!_symmetric(indices);
 }
 
 /// Alias for generalized boundary condition test function.
@@ -264,30 +331,33 @@ unittest
 
 private:
 
-ref auto bcImpl(alias bcfunc, size_t N, T, Indices...)(ref Slice!(N, T*) slice, Indices indices)
+nothrow @nogc pure:
+
+auto bcImpl(alias bcfunc, Tensor, Indices...)(Tensor tensor, Indices indices)
 {
+    immutable N = ReturnType!(Tensor.shape).length;
     static if (N == 1)
     {
-        return slice[bcfunc(cast(int)indices[0], cast(int)slice.length)];
+        return tensor[bcfunc(cast(int)indices[0], cast(int)tensor.length)];
     }
     else static if (N == 2)
     {
-        return slice[bcfunc(cast(int)indices[0], cast(int)slice.length!0),
-            bcfunc(cast(int)indices[1], cast(int)slice.length!1)];
+        return tensor[bcfunc(cast(int)indices[0], cast(int)tensor.length!0),
+            bcfunc(cast(int)indices[1], cast(int)tensor.length!1)];
     }
     else static if (N == 3)
     {
-        return slice[bcfunc(cast(int)indices[0], cast(int)slice.length!0),
-            bcfunc(cast(int)indices[1], cast(int)slice.length!1), bcfunc(cast(int)indices[2],
-                    cast(int)slice.length!2)];
+        return tensor[bcfunc(cast(int)indices[0], cast(int)tensor.length!0),
+            bcfunc(cast(int)indices[1], cast(int)tensor.length!1), bcfunc(cast(int)indices[2],
+                    cast(int)tensor.length!2)];
     }
     else
     {
         foreach (i, ref id; indices)
         {
-            id = bcfunc(cast(int)id, cast(int)slice.shape[i]);
+            id = bcfunc(cast(int)id, cast(int)tensor.shape[i]);
         }
-        return slice[indices];
+        return tensor[indices];
     }
 }
 
diff --git a/source/dcv/features/corner/harris.d b/source/dcv/features/corner/harris.d
index 872c389..72853b8 100644
--- a/source/dcv/features/corner/harris.d
+++ b/source/dcv/features/corner/harris.d
@@ -9,7 +9,12 @@ License: $(LINK3 http://www.boost.org/LICENSE_1_0.txt, Boost Software License -
 */
 module dcv.features.corner.harris;
 
+import std.parallelism : parallel, taskPool, TaskPool;
+
+import ldc.attributes : fastmath;
+
 import mir.ndslice;
+import mir.ndslice.algorithm : ndEach, Yes;
 
 import dcv.core.utils : emptySlice;
 import dcv.imgproc.filter : calcPartialDerivatives;
@@ -28,15 +33,32 @@ Returns:
     Response matrix the same size of the input image, where each pixel represents
     corner response value - the bigger the value, more probably it represents the
     actual corner in the image.
+
+Note:
+    If given, pre-allocated memory has to be contiguous.
  */
 Slice!(2, OutputType*) harrisCorners(InputType, OutputType = InputType)(Slice!(2,
         InputType*) image, in uint winSize = 3, in float k = 0.64f, in float gauss = 0.84f, Slice!(2,
-        OutputType*) prealloc = emptySlice!(2, OutputType))
+        OutputType*) prealloc = emptySlice!(2, OutputType), TaskPool pool = taskPool)
+in
 {
-
-    HarrisDetector det;
-    det.k = k;
-    return calcCorners(image, winSize, gauss, prealloc, det);
+    assert(!image.empty, "Empty image given.");
+    assert(winSize % 2 != 0, "Kernel window size has to be odd.");
+    assert(gauss > 0.0, "Gaussian sigma value has to be greater than 0.");
+    assert(k > 0.0, "K value has to be greater than 0.");
+    if (!prealloc.empty)
+        assert(prealloc.structure.strides[$-1] == 1,
+                "Pre-allocated slice memory is not contiguous.");
+}
+body
+{
+    if (prealloc.shape != image.shape)
+    {
+        prealloc = uninitializedSlice!OutputType(image.shape);
+    }
+    HarrisDetector detector;
+    detector.k = k;
+    return calcCorners(image, winSize, gauss, prealloc, detector, pool);
 }
 
 /**
@@ -52,13 +74,31 @@ Returns:
     Response matrix the same size of the input image, where each pixel represents
     corner response value - the bigger the value, more probably it represents the
     actual corner in the image.
+
+Note:
+    If given, pre-allocated memory has to be contiguous.
  */
 Slice!(2, OutputType*) shiTomasiCorners(InputType, OutputType = InputType)(Slice!(2,
         InputType*) image, in uint winSize = 3, in float gauss = 0.84f, Slice!(2,
-        OutputType*) prealloc = emptySlice!(2, OutputType))
+        OutputType*) prealloc = emptySlice!(2, OutputType), TaskPool pool = taskPool)
+in
 {
-    ShiTomasiDetector det;
-    return calcCorners(image, winSize, gauss, prealloc, det);
+    assert(!image.empty, "Empty image given.");
+    assert(winSize % 2 != 0, "Kernel window size has to be odd.");
+    assert(gauss > 0.0, "Gaussian sigma value has to be greater than 0.");
+    if (!prealloc.empty)
+        assert(prealloc.structure.strides[$-1] == 1,
+                "Pre-allocated slice memory is not contiguous.");
+}
+body
+{
+    if (prealloc.shape != image.shape)
+    {
+        prealloc = uninitializedSlice!OutputType(image.shape);
+    }
+
+    ShiTomasiDetector detector;
+    return calcCorners(image, winSize, gauss, prealloc, detector, pool);
 }
 
 unittest
@@ -109,13 +149,40 @@ unittest
     }
 }
 
+@nogc nothrow @fastmath
+{
+    void calcCornersImpl(Window, Detector)(Window window, Detector detector)
+    {
+        float[3] r = [0.0f, 0.0f, 0.0f];
+        float winSqr = float(window.length!0);
+        winSqr *= winSqr;
+
+        r = ndReduce!sumResponse(r, window);
+
+        r[0] = (r[0] / winSqr) * 0.5f;
+        r[1] /= winSqr;
+        r[2] = (r[2] / winSqr) * 0.5f;
+
+        auto rv = detector(r[0], r[1], r[2]);
+        if (rv > 0)
+            window[$ / 2, $ / 2].corners = rv;
+    }
+
+    float[3] sumResponse(Pack)(float[3] r, Pack pack)
+    {
+        auto gx = pack.fx;
+        auto gy = pack.fy;
+        return [r[0] + gx * gx, r[1] + gx * gy, r[2] + gy * gy];
+    }
+}
+
 private:
 
 struct HarrisDetector
 {
     float k;
 
-    float opCall(float r1, float r2, float r3)
+    @fastmath @nogc nothrow float opCall(float r1, float r2, float r3)
     {
         return (((r1 * r1) - (r2 * r3)) - k * ((r1 + r3) * r1 + r3));
     }
@@ -123,73 +190,27 @@ struct HarrisDetector
 
 struct ShiTomasiDetector
 {
-    float opCall(float r1, float r2, float r3)
+    @fastmath @nogc nothrow float opCall(float r1, float r2, float r3)
     {
-        import std.math : sqrt;
-
+        import ldc.intrinsics : sqrt = llvm_sqrt;
         return ((r1 + r3) - sqrt((r1 - r3) * (r1 - r3) + r2 * r2));
     }
 }
 
 Slice!(2, OutputType*) calcCorners(Detector, InputType, OutputType)(Slice!(2, InputType*) image,
-        uint winSize, float gaussSigma, Slice!(2, OutputType*) prealloc, Detector detector)
+        uint winSize, float gaussSigma, Slice!(2, OutputType*) prealloc, Detector detector, TaskPool pool)
 {
-
-    import std.math : exp, PI;
-    import std.array : uninitializedArray;
-    import std.range : iota;
-    import std.algorithm.iteration : reduce, each;
-    import std.algorithm.comparison : equal;
-
-    assert(!image.empty);
-
-    if (!prealloc.shape[].equal(image.shape[]))
-    {
-        prealloc = uninitializedArray!(OutputType[])(image.shape[].reduce!"a*b").sliced(image.shape);
-    }
-    prealloc[] = cast(OutputType)0;
-
-    auto rows = image.length!0;
-    auto cols = image.length!1;
+    // TODO: implement gaussian weighting!
 
     Slice!(2, InputType*) fx, fy;
     calcPartialDerivatives(image, fx, fy);
 
-    auto winSqr = winSize ^^ 2;
-    auto winHalf = winSize / 2;
+    auto windowPack = assumeSameStructure!("corners", "fx", "fy")(prealloc, fx, fy).windows(winSize, winSize);
 
-    float R; // Score value
-    float w, gx, gy, r1, r2, r3;
-    float gaussMul = 1.0f / (2.0f * PI * gaussSigma);
-    float gaussDel = 2.0f * (gaussSigma ^^ 2);
-
-    foreach (i; winHalf.iota(rows - winHalf))
+    foreach (windowRow; pool.parallel(windowPack))
     {
-        foreach (j; winHalf.iota(cols - winHalf))
-        {
-            r1 = 0.;
-            r2 = 0.;
-            r3 = 0.;
-            for (int cr = cast(int)(i - winHalf); cr < i + winHalf; cr++)
-            {
-                for (int cc = cast(int)(j - winHalf); cc < j + winHalf; cc++)
-                {
-                    w = 1.0f; //gaussMul * exp(-((cast(float)cr - i)^^2 + (cast(float)cc - j)^^2) / gaussDel);
-
-                    gx = fx[cr, cc];
-                    gy = fy[cr, cc];
-                    r1 += w * (gx * gx);
-                    r2 += w * (gx * gy);
-                    r3 += w * (gy * gy);
-                }
-            }
-            r1 = (r1 / winSqr) * 0.5;
-            r2 /= winSqr;
-            r3 = (r3 / winSqr) * 0.5;
-            R = detector(r1, r2, r3);
-            if (R > 0)
-                prealloc[i, j] = cast(OutputType)R;
-        }
+        windowRow.ndEach!(win => calcCornersImpl(win, detector));
     }
+
     return prealloc;
 }
diff --git a/source/dcv/features/rht.d b/source/dcv/features/rht.d
index dbdd8cc..801538e 100644
--- a/source/dcv/features/rht.d
+++ b/source/dcv/features/rht.d
@@ -109,16 +109,23 @@ mixin template BaseRht()
     /// Run RHT using non-zero points in image as edge points.
     auto opCall(T)(Slice!(2, T*) image)
     {
-        Point[] points;
-        foreach (y; 0 .. image.length!0)
-            foreach (x; 0 .. image.length!1)
+        import std.array : appender;
+
+        auto points = appender!(Point[])();
+        int x, y = 0;
+
+        foreach(row; image)
+        {
+            x = 0;
+            foreach(e; row)
             {
-                if (image[y, x] > 0)
-                {
-                    points ~= Point(cast(int)x, cast(int)y);
-                }
+                if (e > 0)
+                    points.put(Point(x, y));
+                ++x;
             }
-        return this.opCall(image, points);
+            ++y;
+        }
+        return this.opCall(image, points.data);
     }
 
     /// Run RHT using prepopullated array of edge points (that may be filtered beforehand).
diff --git a/source/dcv/features/utils.d b/source/dcv/features/utils.d
index 2ea5ed5..404b49c 100644
--- a/source/dcv/features/utils.d
+++ b/source/dcv/features/utils.d
@@ -36,48 +36,46 @@ struct Feature
 /**
 Extract corners as array of 2D points, from response matrix.
 
-params:
-cornerResponse = Response matrix, collected as output from corner
-detection algoritms such as harrisCorners, or shiTomasiCorners.
-count = Number of corners which need to be extracted. Default is
--1 which indicate that all responses with value above the threshold
-will be returned.
-threshold = Response threshold - response values in the matrix
-larger than this are considered as valid corners.
-
-return:
-Dynamic array of size_t[2], as in array of 2D points, of corner reponses 
-which fit the given criteria.
+Params:
+    cornerResponse = Response matrix, collected as output from corner
+    detection algoritms such as harrisCorners, or shiTomasiCorners.
+    count = Number of corners which need to be extracted. Default is
+    -1 which indicate that all responses with value above the threshold
+    will be returned.
+    threshold = Response threshold - response values in the matrix
+    larger than this are considered as valid corners.
+
+Returns:
+    Dynamic array of size_t[2], as in array of 2D points, of corner reponses 
+    which fit the given criteria.
+
+Note:
+    Corner response slice memory has to be contiguous.
 */
-size_t[2][] extractCorners(T)(Slice!(2, T*) cornerResponse, int count = -1, T threshold = cast(T)0)@trusted pure nothrow 
-        if (isNumeric!T)
+pure nothrow auto extractCorners(T)(Slice!(2, T*) cornerResponse, int count = -1, T threshold = 0)
+    if (isNumeric!T)
 {
-    import std.algorithm : sort, map, min;
+    import std.algorithm.sorting : sort;
+    import std.algorithm.iteration : map, filter;
+    import std.range : take;
     import std.array : array;
-    import std.range : take, iota;
-
-    size_t[2][float] features;
 
     if (cornerResponse.empty)
     {
         return null;
     }
 
-    auto rows = cornerResponse.length!0;
-    auto cols = cornerResponse.length!1;
-
-    foreach (r; rows.iota)
-    {
-        foreach (c; cols.iota)
-        {
-            auto res = cornerResponse[r, c];
-            if (res > threshold)
-                features[res] = [r, c];
-        }
-    }
-
-    return sort!"a > b"(features.keys).take(count > 0 ? min(count, features.length) : features.length)
-        .map!(a => features[a]).array;
+    assert(cornerResponse.structure.strides[$-1] == 1,
+            "Corner response slice strides are not contiguous."); // TODO check other dimensions. (use isContiguous)
+
+    return assumeSameStructure!("indices", "value")(indexSlice(cornerResponse.shape), cornerResponse)
+        .byElement
+        .filter!(p => p.value > threshold)
+        .array
+        .sort!((a, b) => a.value > b.value)
+        .take(count)
+        .map!(p => p.indices)
+        .array;
 }
 
 unittest
diff --git a/source/dcv/imgproc/color.d b/source/dcv/imgproc/color.d
index c606088..e5d232a 100644
--- a/source/dcv/imgproc/color.d
+++ b/source/dcv/imgproc/color.d
@@ -34,21 +34,19 @@ luv2rgb -||-
 luv2rgb -||-
 bayer2rgb -||-
 */
+import std.traits : isFloatingPoint, isNumeric;
 
-import std.array : uninitializedArray;
-import std.traits : CommonType, isFloatingPoint, isAssignable, isNumeric;
-import std.algorithm.iteration : sum, each, reduce, map;
-import std.algorithm.mutation : copy;
-import std.algorithm.comparison : equal;
-import std.algorithm : swap;
-import std.range : zip, array, iota;
-import std.exception : enforce;
-import std.range : lockstep;
+import ldc.attributes : fastmath;
 
 import mir.ndslice;
 
 import dcv.core.utils;
 
+version(unittest)
+{
+    import std.algorithm.comparison : equal;
+}
+
 /**
 RGB to Grayscale convertion strategy.
 */
@@ -62,9 +60,9 @@ enum Rgb2GrayConvertion
 Convert RGB image to grayscale.
 
 Params:
-    range = Input image range. Should have 3 channels, represented 
-    as R, G and B respectivelly in that order.
-    prealloc = Pre-allocated range, where grayscale image will be copied. Default
+    input = Input image. Should have 3 channels, represented as R, G and B
+        respectively in that order.
+    prealloc = Pre-allocated buffer, where grayscale image will be copied. Default
     argument is an empty slice, where new data is allocated and returned. If given 
     slice is not of corresponding shape(range.shape[0], range.shape[1]), it is 
     discarded and allocated anew.
@@ -72,15 +70,14 @@ Params:
 
 Returns:
     Returns grayscale version of the given RGB image, of the same size.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(2, V*) rgb2gray(V)(Slice!(3, V*) range, Slice!(2, V*) prealloc = emptySlice!(2, V),
+Slice!(2, V*) rgb2gray(V)(Slice!(3, V*) input, Slice!(2, V*) prealloc = emptySlice!(2, V),
         Rgb2GrayConvertion conv = Rgb2GrayConvertion.LUMINANCE_PRESERVE) pure nothrow
 {
-
-    auto m = rgb2GrayMltp[conv].map!(a => cast(real)a).array;
-    m[] /= cast(real)m.sum;
-
-    return rgb2grayImpl(range, prealloc, m);
+    return rgbbgr2gray!(false, V)(input, prealloc, conv);
 }
 
 unittest
@@ -100,8 +97,8 @@ Same as rgb2gray, but follows swapped channels if luminance preservation
 is chosen as convertion strategy.
 
 Params:
-    range = Input image range. Should have 3 channels, represented 
-    as B, G and R respectivelly in that order.
+    input = Input image. Should have 3 channels, represented as B, G and R
+        respectively in that order.
     prealloc = Pre-allocated range, where grayscale image will be copied. Default
     argument is an empty slice, where new data is allocated and returned. If given 
     slice is not of corresponding shape(range.shape[0], range.shape[1]), it is 
@@ -110,16 +107,14 @@ Params:
 
 Returns:
     Returns grayscale version of the given BGR image, of the same size.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(2, V*) bgr2gray(V)(Slice!(3, V*) range, Slice!(2, V*) prealloc = emptySlice!(2, V),
+Slice!(2, V*) bgr2gray(V)(Slice!(3, V*) input, Slice!(2, V*) prealloc = emptySlice!(2, V),
         Rgb2GrayConvertion conv = Rgb2GrayConvertion.LUMINANCE_PRESERVE) pure nothrow
 {
-
-    auto m = rgb2GrayMltp[conv].map!(a => cast(real)a).array;
-    m[] /= m.sum;
-    m[0].swap(m[2]);
-
-    return rgb2grayImpl(range, prealloc, m);
+    return rgbbgr2gray!(true, V)(input, prealloc, conv);
 }
 
 unittest
@@ -133,6 +128,36 @@ unittest
     assert(equal!approxEqual(gray.byElement, [0, 1, 2, 3]));
 }
 
+private Slice!(2, V*) rgbbgr2gray(bool isBGR, V)(Slice!(3, V*) input, Slice!(2, V*) prealloc = emptySlice!(2, V),
+        Rgb2GrayConvertion conv = Rgb2GrayConvertion.LUMINANCE_PRESERVE) pure nothrow
+in
+{
+    assert(!input.empty, "Input image is empty.");
+}
+body
+{
+    if (prealloc.shape != input.shape[0 .. 2])
+        prealloc = uninitializedSlice!V(input.shape[0 .. 2]);
+
+    auto rgb = staticPack!3(input);
+
+    assert(rgb.structure.strides == prealloc.structure.strides,
+            "Input image and pre-allocated buffer strides are not identical.");
+
+    auto pack = assumeSameStructure!("rgb", "gray")(rgb, prealloc);
+    alias PT = DeepElementType!(typeof(pack));
+
+    if (conv == Rgb2GrayConvertion.MEAN)
+        pack.ndEach!(rgb2grayImplMean!PT);
+    else
+        static if (isBGR)
+            pack.ndEach!(bgr2grayImplLuminance!(PT));
+        else
+            pack.ndEach!(rgb2grayImplLuminance!(PT));
+
+    return prealloc;
+}
+
 /**
 Convert gray image to RGB.
 
@@ -140,7 +165,7 @@ Uses grayscale value and assigns it's value
 to each of three channels for the RGB image version.
 
 Params:
-    range = Grayscale image version, to be converted to the RGB.
+    input = Grayscale image, to be converted to the RGB.
     prealloc = Pre-allocated range, where RGB image will be copied. Default
     argument is an empty slice, where new data is allocated and returned. If given 
     slice is not of corresponding shape(range.shape[0], range.shape[1], 3), it is 
@@ -148,26 +173,31 @@ Params:
 
 Returns:
     Returns RGB version of the given grayscale image.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(3, V*) gray2rgb(V)(Slice!(2, V*) range, Slice!(3, V*) prealloc = emptySlice!(3, V)) pure nothrow
+Slice!(3, V*) gray2rgb(V)(Slice!(2, V*) input, Slice!(3, V*) prealloc = emptySlice!(3, V)) pure nothrow
 {
-
-    if (prealloc.empty || (range.shape[0 .. 2][]).equal(prealloc.shape[0 .. 2][]) || prealloc.shape[2] != 3)
-        prealloc = uninitializedArray!(V[])(range.length!0 * range.length!1 * 3).sliced(range.length!0,
-                range.length!1, 3);
-
-    for (size_t r = 0; r < range.length!0; ++r)
+    Slice!(2, V[3]*) rgb;
+    if (input.shape != prealloc.shape[0 .. 2])
     {
-        for (size_t c = 0; c < range.length!1; ++c)
-        {
-            immutable v = range[r, c];
-            prealloc[r, c, 0] = v;
-            prealloc[r, c, 1] = v;
-            prealloc[r, c, 2] = v;
-        }
+        rgb = uninitializedSlice!(V[3])(input.length!0, input.length!1);
+    }
+    else
+    {
+        rgb = staticPack!3(prealloc);
     }
 
-    return prealloc;
+    assert(rgb.structure.strides == input.structure.strides,
+            "Input and pre-allocated slices' strides are not identical.");
+
+    auto pack = assumeSameStructure!("gray", "rgb")(input, rgb);
+    alias PT = DeepElementType!(typeof(pack));
+
+    pack.ndEach!(gray2rgbImpl!PT);
+
+    return staticUnpack!3(rgb);
 }
 
 unittest
@@ -193,7 +223,7 @@ algorithm to be ranged as 0-255 for ubyte, 0-65535 for ushort,
 and 0-1 for floating point types.
 
 Params:
-    range = RGB image version, which gets converted to HVS.
+    input = RGB image, which gets converted to HSV.
     prealloc = Pre-allocated range, where HSV image will be copied. Default
     argument is an empty slice, where new data is allocated and returned. If given 
     slice is not of corresponding shape(range.shape[0], range.shape[1], 3), it is 
@@ -201,69 +231,28 @@ Params:
 
 Returns:
     Returns HSV verion of the given RGB image.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(3, R*) rgb2hsv(R, V)(Slice!(3, V*) range, Slice!(3, R*) prealloc = emptySlice!(3, R))
+Slice!(3, R*) rgb2hsv(R, V)(Slice!(3, V*) input, Slice!(3, R*) prealloc = emptySlice!(3, R)) pure nothrow
         if (isNumeric!R && isNumeric!V)
+in
 {
-    import std.algorithm.comparison : max, min;
-
     static assert(R.max >= 360, "Invalid output type for HSV (R.max >= 360)");
+    assert(input.length!2 == 3, "Invalid channel count.");
+}
+body
+{
+    if (prealloc.shape != input.shape)
+        prealloc = uninitializedSlice!R(input.shape);
 
-    enforce(range.length!2 == 3, "Invalid channel count.");
-
-    if (prealloc.empty || prealloc.shape[].equal(range.shape[]))
-    {
-        prealloc = uninitializedArray!(R[])(range.length!0 * range.length!1 * 3).sliced(range.length!0,
-                range.length!1, 3);
-    }
-
-    foreach (rgb, hsv; lockstep(range.pack!1.byElement, prealloc.pack!1.byElement))
-    {
-        static if (is(V == ubyte))
-        {
-            auto r = cast(float)(rgb[0]) / 255.;
-            auto g = cast(float)(rgb[1]) / 255.;
-            auto b = cast(float)(rgb[2]) / 255.;
-        }
-        else static if (is(V == ushort))
-        {
-            auto r = cast(float)(rgb[0]) / 65535.;
-            auto g = cast(float)(rgb[1]) / 65535.;
-            auto b = cast(float)(rgb[2]) / 65535.;
-        }
-        else static if (isFloatingPoint!V)
-        {
-            // assumes rgb value range 0-1
-            auto r = cast(float)(rgb[0]);
-            auto g = cast(float)(rgb[1]);
-            auto b = cast(float)(rgb[2]);
-        }
-        else
-        {
-            static assert(0, "Invalid RGB input type: " ~ V.stringof);
-        }
-
-        auto cmax = max(r, max(g, b));
-        auto cmin = min(r, min(g, b));
-        auto cdelta = cmax - cmin;
-
-        hsv[0] = cast(R)((cdelta == 0) ? 0 : (cmax == r) ? 60.0f * ((g - b) / cdelta) : (cmax == g)
-                ? 60.0f * ((b - r) / cdelta + 2) : 60.0f * ((r - g) / cdelta + 4));
+    assert(input.structure.strides == prealloc.structure.strides,
+            "Input image and pre-allocated buffer strides are not identical.");
 
-        if (hsv[0] < 0)
-            hsv[0] += 360;
+    auto pack = assumeSameStructure!("rgb", "hsv")(input.staticPack!3, prealloc.staticPack!3);
+    pack.ndEach!(rgb2hsvImpl!(DeepElementType!(typeof(pack))));
 
-        static if (isFloatingPoint!R)
-        {
-            hsv[1] = cast(R)(cmax == 0 ? 0 : cdelta / cmax);
-            hsv[2] = cast(R)(cmax);
-        }
-        else
-        {
-            hsv[1] = cast(R)(100.0 * (cmax == 0 ? 0 : cdelta / cmax));
-            hsv[2] = cast(R)(100.0 * cmax);
-        }
-    }
     return prealloc;
 }
 
@@ -302,77 +291,35 @@ range RGB values to be 0-255, ushort 0-65535, and floating types
 0.0-1.0. Other types are not supported.
 
 Params:
-    range = RGB image version, which gets converted to HVS.
-    prealloc = Pre-allocated range, where HSV image will be copied. Default
+    input = HSV image, which gets converted to RGB.
+    prealloc = Pre-allocated range, where RGB image will be copied. Default
     argument is an empty slice, where new data is allocated and returned. If given 
     slice is not of corresponding shape(range.shape[0], range.shape[1], 3), it is 
     discarded and allocated anew.
 
 Returns:
-    Returns HSV verion of the given RGB image.
+    Returns RGB verion of the given HSV image.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(3, R*) hsv2rgb(R, V)(Slice!(3, V*) range, Slice!(3, R*) prealloc = emptySlice!(3, R))
+Slice!(3, R*) hsv2rgb(R, V)(Slice!(3, V*) input, Slice!(3, R*) prealloc = emptySlice!(3, R)) pure nothrow
         if (isNumeric!R && isNumeric!V)
+in
 {
-    import std.math : fabs, abs;
-
-    enforce(range.length!2 == 3, "Invalid channel count.");
-
-    if (prealloc.empty || prealloc.shape[].equal(range.shape[]))
-    {
-        prealloc = uninitializedArray!(R[])(range.length!0 * range.length!1 * 3).sliced(range.length!0,
-                range.length!1, 3);
-    }
-
-    float[3] _rgb;
-    immutable hhswitch = [[0, 1, 2], [1, 0, 2], [2, 0, 1], [2, 1, 0], [1, 2, 0], [0, 2, 1]];
-
-    foreach (hsv, rgb; lockstep(range.pack!1.byElement, prealloc.pack!1.byElement))
-    {
-
-        static if (isFloatingPoint!V)
-        {
-            auto h = hsv[0];
-            auto s = hsv[1];
-            auto v = hsv[2];
-        }
-        else
-        {
-            float h = cast(float)hsv[0];
-            float s = cast(float)hsv[1] / 100.0;
-            float v = cast(float)hsv[2] / 100.0;
-        }
+    assert(input.length!2 == 3, "Invalid channel count.");
+}
+body
+{
+    if (prealloc.shape != input.shape)
+        prealloc = uninitializedSlice!R(input.shape);
 
-        float c = v * s;
-        float x = c * (1. - fabs((h / 60.) % 2 - 1));
-        float m = v - c;
+    assert(input.structure.strides == prealloc.structure.strides,
+            "Input image and pre-allocated buffer strides are not identical.");
 
-        int hh = abs(cast(int)(h / 60.) % 6);
-        _rgb = [c, x, 0.];
+    auto pack = assumeSameStructure!("hsv", "rgb")(input.staticPack!3, prealloc.staticPack!3);
+    pack.ndEach!(hsv2rgbImpl!(DeepElementType!(typeof(pack))));
 
-        static if (isFloatingPoint!R)
-        {
-            rgb[0] = cast(R)((_rgb[hhswitch[hh][0]] + m));
-            rgb[1] = cast(R)((_rgb[hhswitch[hh][1]] + m));
-            rgb[2] = cast(R)((_rgb[hhswitch[hh][2]] + m));
-        }
-        else static if (is(R == ubyte))
-        {
-            rgb[0] = cast(R)((_rgb[hhswitch[hh][0]] + m) * 255.);
-            rgb[1] = cast(R)((_rgb[hhswitch[hh][1]] + m) * 255.);
-            rgb[2] = cast(R)((_rgb[hhswitch[hh][2]] + m) * 255.);
-        }
-        else static if (is(R == ushort))
-        {
-            rgb[0] = cast(R)((_rgb[hhswitch[hh][0]] + m) * 65535.);
-            rgb[1] = cast(R)((_rgb[hhswitch[hh][1]] + m) * 65535.);
-            rgb[2] = cast(R)((_rgb[hhswitch[hh][2]] + m) * 65535.);
-        }
-        else
-        {
-            static assert(0, "Output type is not supported: " ~ R.stringof);
-        }
-    }
     return prealloc;
 }
 
@@ -401,7 +348,7 @@ unittest
     hsv2rgbTest(cast(ushort[])[150, 50, 80], cast(ubyte[])[102, 204, 153]);
 
     hsv2rgbTest(cast(float[])[0.0f, 0.0f, 1.0f], cast(ubyte[])[255, 255, 255]);
-    hsv2rgbTest(cast(float[])[150.0f, 0.5f, 1.0f], cast(ubyte[])[128, 255, 191]);
+    hsv2rgbTest(cast(float[])[150.0f, 0.5f, 1.0f], cast(ubyte[])[127, 255, 191]);
     hsv2rgbTest(cast(float[])[150.0f, 0.5f, 0.8f], cast(ubyte[])[102, 204, 153]);
 
     hsv2rgbTest(cast(ushort[])[0, 0, 100], cast(ushort[])[65535, 65535, 65535]);
@@ -419,39 +366,33 @@ Convert RGB image format to YUV.
 YUV images in dcv are organized in the same buffer plane
 where quantity of luma and chroma values are the same (as in
 YUV444 format).
+
+Params:
+    input = Input RGB image.
+    prealloc = Optional pre-allocated buffer. If given, has to be
+        of same shape as input image, otherwise gets reallocated.
+
+Returns:
+    Resulting YUV image slice.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(3, V*) rgb2yuv(V)(Slice!(3, V*) range, Slice!(3, V*) prealloc = emptySlice!(3, V))
+Slice!(3, V*) rgb2yuv(V)(Slice!(3, V*) input, Slice!(3, V*) prealloc = emptySlice!(3, V)) pure nothrow
+in
 {
+    assert(input.length!2 == 3, "Invalid channel count.");
+}
+body
+{
+    if (prealloc.shape != input.shape)
+        prealloc = uninitializedSlice!V(input.shape);
 
-    enforce(range.length!2 == 3, "Invalid channel count.");
-
-    if (prealloc.empty || prealloc.shape[].equal(range.shape[]))
-    {
-        prealloc = uninitializedArray!(V[])(range.length!0 * range.length!1 * 3).sliced(range.length!0,
-                range.length!1, 3);
-    }
+    assert(input.structure.strides == prealloc.structure.strides,
+            "Input image and pre-allocated buffer strides are not identical.");
 
-    foreach (rgb, yuv; lockstep(range.pack!1.byElement, prealloc.pack!1.byElement))
-    {
-        static if (isFloatingPoint!V)
-        {
-            auto r = cast(int)rgb[0];
-            auto g = cast(int)rgb[1];
-            auto b = cast(int)rgb[2];
-            yuv[0] = clip!V((r * .257) + (g * .504) + (b * .098) + 16);
-            yuv[1] = clip!V((r * .439) + (g * .368) + (b * .071) + 128);
-            yuv[2] = clip!V(-(r * .148) - (g * .291) + (b * .439) + 128);
-        }
-        else
-        {
-            auto r = rgb[0];
-            auto g = rgb[1];
-            auto b = rgb[2];
-            yuv[0] = clip!V(((66 * (r) + 129 * (g) + 25 * (b) + 128) >> 8) + 16);
-            yuv[1] = clip!V(((-38 * (r) - 74 * (g) + 112 * (b) + 128) >> 8) + 128);
-            yuv[2] = clip!V(((112 * (r) - 94 * (g) - 18 * (b) + 128) >> 8) + 128);
-        }
-    }
+    auto p = assumeSameStructure!("rgb", "yuv")(input, prealloc).pack!1;
+    p.ndEach!(rgb2yuvImpl!(V, DeepElementType!(typeof(p))));
 
     return prealloc;
 }
@@ -462,38 +403,32 @@ Convert YUV image to RGB.
 As in rgb2yuv conversion, YUV format is considered to have 
 same amount of luma and chroma.
 
-TODO: 
-    Separate input and output type as in rgb2hsv etc.
+Params:
+    input = Input YUV image.
+    prealloc = Optional pre-allocated buffer. If given, has to be
+        of same shape as input image, otherwise gets reallocated.
+
+Returns:
+    Resulting RGB image slice.
+
+Note:
+    Input and pre-allocated slices' strides must be identical.
 */
-Slice!(3, V*) yuv2rgb(V)(Slice!(3, V*) range, Slice!(3, V*) prealloc = emptySlice!(3, V))
+Slice!(3, V*) yuv2rgb(V)(Slice!(3, V*) input, Slice!(3, V*) prealloc = emptySlice!(3, V)) pure nothrow
+in
 {
+    assert(input.length!2 == 3, "Invalid channel count.");
+}
+body
+{
+    if (prealloc.shape != input.shape)
+        prealloc = uninitializedSlice!V(input.shape);
 
-    enforce(range.length!2 == 3, "Invalid channel count.");
-
-    if (prealloc.empty || prealloc.shape[].equal(range.shape[]))
-    {
-        prealloc = uninitializedArray!(V[])(range.length!0 * range.length!1 * 3).sliced(range.length!0,
-                range.length!1, 3);
-    }
+    assert(input.structure.strides == prealloc.structure.strides,
+            "Input image and pre-allocated buffer strides are not identical.");
 
-    foreach (yuv, rgb; lockstep(range.pack!1.byElement, prealloc.pack!1.byElement))
-    {
-        auto y = cast(int)(yuv[0]) - 16;
-        auto u = cast(int)(yuv[1]) - 128;
-        auto v = cast(int)(yuv[2]) - 128;
-        static if (isFloatingPoint!V)
-        {
-            rgb[0] = clip!V(y + 1.4075 * v);
-            rgb[1] = clip!V(y - 0.3455 * u - (0.7169 * v));
-            rgb[2] = clip!V(y + 1.7790 * u);
-        }
-        else
-        {
-            rgb[0] = clip!V((298 * y + 409 * v + 128) >> 8);
-            rgb[1] = clip!V((298 * y - 100 * u - 208 * v + 128) >> 8);
-            rgb[2] = clip!V((298 * y + 516 * u + 128) >> 8);
-        }
-    }
+    auto p = assumeSameStructure!("yuv", "rgb")(input, prealloc).pack!1;
+    p.ndEach!(yuv2rgbImpl!(V, DeepElementType!(typeof(p))));
 
     return prealloc;
 }
@@ -534,44 +469,244 @@ unittest
     yuv2rgbTest(cast(ubyte[])[41, 240, 110], cast(ubyte[])[0, 0, 255]);
 }
 
-private:
+pure @nogc nothrow @fastmath:
+
+void rgb2grayImplMean(P)(P pack)
+{
+    alias V = typeof(pack.gray);
+    pack.gray = cast(V)((pack.rgb[0] + pack.rgb[1] + pack.rgb[2]) / 3);
+}
+
+void rgb2grayImplLuminance(RGBGRAY)(RGBGRAY pack)
+{
+    alias V = typeof(pack.gray);
+    pack.gray = cast(V)(
+            cast(float)pack.rgb[0] * 0.212642529f +
+            cast(float)pack.rgb[1] * 0.715143029f +
+            cast(float)pack.rgb[2] * 0.072214443f
+            );
+}
+
+void bgr2grayImplLuminance(RGBGRAY)(RGBGRAY pack)
+{
+    alias V = typeof(pack.gray);
+    pack.gray = cast(V)(
+            cast(float)pack.rgb[2] * 0.212642529f +
+            cast(float)pack.rgb[1] * 0.715143029f +
+            cast(float)pack.rgb[0] * 0.072214443f
+            );
+}
 
-immutable real[][] rgb2GrayMltp = [[0.3333, 0.3333, 0.3333], [0.2126, 0.715, 0.0722]];
+void gray2rgbImpl(GRAYRGB)(GRAYRGB pack)
+{
+    auto v = pack.gray;
+    pack.rgb[0] = v;
+    pack.rgb[1] = v;
+    pack.rgb[2] = v;
+}
 
-Slice!(2, V*) rgb2grayImpl(V)(Slice!(3, V*) range, Slice!(2, V*) prealloc, in real[] m) pure nothrow
+void rgb2hsvImpl(RGBHSV)(RGBHSV pack)
 {
-    if (prealloc.empty)
+    import ldc.intrinsics : max = llvm_maxnum, min = llvm_minnum;
+
+    alias V = typeof(pack.rgb[0]);
+    alias R = typeof(pack.hsv[0]);
+
+    static if (is(V == ubyte))
+    {
+        auto r = cast(float)(pack.rgb[0]) * (1.0f / 255.0f);
+        auto g = cast(float)(pack.rgb[1]) * (1.0f / 255.0f);
+        auto b = cast(float)(pack.rgb[2]) * (1.0f / 255.0f);
+    }
+    else static if (is(V == ushort))
+    {
+        auto r = cast(float)(pack.rgb[0]) * (1.0f / 65535.0f);
+        auto g = cast(float)(pack.rgb[1]) * (1.0f / 65535.0f);
+        auto b = cast(float)(pack.rgb[2]) * (1.0f / 65535.0f);
+    }
+    else static if (isFloatingPoint!V)
+    {
+        // assumes rgb value range 0-1
+        auto r = cast(float)(pack.rgb[0]);
+        auto g = cast(float)(pack.rgb[1]);
+        auto b = cast(float)(pack.rgb[2]);
+    }
+    else
+    {
+        static assert(0, "Invalid RGB input type: " ~ V.stringof);
+    }
+
+    auto cmax = max(r, max(g, b));
+    auto cmin = min(r, min(g, b));
+    auto cdelta = cmax - cmin;
+
+    auto h = cast(R)((cdelta == 0) ? 0 : (cmax == r) ? 60.0f * ((g - b) / cdelta) : (cmax == g)
+            ? 60.0f * ((b - r) / cdelta + 2) : 60.0f * ((r - g) / cdelta + 4));
+
+    if (h < 0)
+        h += 360;
+
+    static if (isFloatingPoint!R)
+    {
+        auto s = cast(R)(cmax == 0 ? 0 : cdelta / cmax);
+        auto v = cast(R)(cmax);
+    }
+    else
     {
-        if (!(range.shape[0 .. 2][].equal(prealloc.shape[0 .. 2][])))
-            prealloc = uninitializedArray!(V[])(range.shape[0] * range.shape[1]).sliced(range.shape[0], range.shape[1]);
+        auto s = cast(R)(100.0 * (cmax == 0 ? 0 : cdelta / cmax));
+        auto v = cast(R)(100.0 * cmax);
     }
 
-    auto rp = range.pack!1;
+    pack.hsv[0] = h;
+    pack.hsv[1] = s;
+    pack.hsv[2] = v;
+}
+
+void hsv2rgbImpl(HSVRGB)(HSVRGB pack)
+{
+    alias V = typeof(pack.hsv[0]);
+    alias R = typeof(pack.rgb[0]);
+
+    float r, g, b, p, q, t;
 
-    auto rows = rp.length!0;
-    auto cols = rp.length!1;
+    static if (isFloatingPoint!V)
+    {
+        auto h = pack.hsv[0];
+        auto s = pack.hsv[1];
+        auto v = pack.hsv[2];
+    }
+    else
+    {
+        float h = cast(float)pack.hsv[0];
+        float s = cast(float)pack.hsv[1] * 0.01f;
+        float v = cast(float)pack.hsv[2] * 0.01f;
+    }
 
-    for (size_t i = 0; i < rows; ++i)
+    if (s <= 0.0f)
     {
-        auto g_row = prealloc[i, 0 .. cols];
-        auto rgb_row = rp[i, 0 .. cols];
-        size_t j = 0;
-        for (; j < cols; ++j)
+        static if (isFloatingPoint!R)
+        {
+            pack.rgb[0] = cast(R)v;
+            pack.rgb[1] = cast(R)v;
+            pack.rgb[2] = cast(R)v;
+        }
+        else
         {
-            auto rgb = rgb_row[j];
-            auto v = rgb[0] * m[0] + rgb[1] * m[1] + rgb[2] * m[2];
-            static if (isFloatingPoint!(typeof(v)) && !isFloatingPoint!V)
-            {
-                import std.math : floor;
-
-                g_row[j] = cast(V)(v + 0.5).floor;
-            }
-            else
-            {
-                g_row[j] = cast(V)v;
-            }
+            pack.rgb[0] = cast(R)(v * R.max);
+            pack.rgb[1] = cast(R)(v * R.max);
+            pack.rgb[2] = cast(R)(v * R.max);
         }
+        return;
     }
 
-    return prealloc;
+    if (v <= 0.0f)
+    {
+        pack.rgb[0] = cast(R)0;
+        pack.rgb[1] = cast(R)0;
+        pack.rgb[2] = cast(R)0;
+        return;
+    }
+
+    if (h >= 360.0f)
+        h = 0.0f;
+    else
+        h /= 60.0;
+
+    auto hh = cast(int)h;
+    auto ff = h - float(hh);
+
+    p = v * (1.0f - s);
+    q = v * (1.0f - (s * ff));
+    t = v * (1.0f - (s * (1.0f - ff)));
+
+    switch (hh)
+    {
+    case 0:
+        r = v;
+        g = t;
+        b = p;
+        break;
+    case 1:
+        r = q;
+        g = v;
+        b = p;
+        break;
+    case 2:
+        r = p;
+        g = v;
+        b = t;
+        break;
+
+    case 3:
+        r = p;
+        g = q;
+        b = v;
+        break;
+    case 4:
+        r = t;
+        g = p;
+        b = v;
+        break;
+    case 5:
+    default:
+        r = v;
+        g = p;
+        b = q;
+        break;
+    }
+
+    static if (isFloatingPoint!R)
+    {
+        pack.rgb[0] = cast(R)r;
+        pack.rgb[1] = cast(R)g;
+        pack.rgb[2] = cast(R)b;
+    }
+    else
+    {
+        pack.rgb[0] = cast(R)(r * R.max);
+        pack.rgb[1] = cast(R)(g * R.max);
+        pack.rgb[2] = cast(R)(b * R.max);
+    }
+}
+
+void rgb2yuvImpl(V, RGBYUV)(RGBYUV pack)
+{
+    static if (isFloatingPoint!V)
+    {
+        auto r = cast(int)pack[0].rgb;
+        auto g = cast(int)pack[1].rgb;
+        auto b = cast(int)pack[2].rgb;
+        pack[0].yuv = clip!V((r * .257) + (g * .504) + (b * .098) + 16);
+        pack[1].yuv = clip!V((r * .439) + (g * .368) + (b * .071) + 128);
+        pack[2].yuv = clip!V(-(r * .148) - (g * .291) + (b * .439) + 128);
+    }
+    else
+    {
+        auto r = pack[0].rgb;
+        auto g = pack[1].rgb;
+        auto b = pack[2].rgb;
+        pack[0].yuv = clip!V(((66 * (r) + 129 * (g) + 25 * (b) + 128) >> 8) + 16);
+        pack[1].yuv = clip!V(((-38 * (r) - 74 * (g) + 112 * (b) + 128) >> 8) + 128);
+        pack[2].yuv = clip!V(((112 * (r) - 94 * (g) - 18 * (b) + 128) >> 8) + 128);
+    }
+}
+
+void yuv2rgbImpl(V, YUVRGB)(YUVRGB pack)
+{
+    auto y = cast(int)(pack[0].yuv) - 16;
+    auto u = cast(int)(pack[1].yuv) - 128;
+    auto v = cast(int)(pack[2].yuv) - 128;
+    static if (isFloatingPoint!V)
+    {
+        pack[0].rgb = clip!V(y + 1.4075 * v);
+        pack[1].rgb = clip!V(y - 0.3455 * u - (0.7169 * v));
+        pack[2].rgb = clip!V(y + 1.7790 * u);
+    }
+    else
+    {
+        pack[0].rgb = clip!V((298 * y + 409 * v + 128) >> 8);
+        pack[1].rgb = clip!V((298 * y - 100 * u - 208 * v + 128) >> 8);
+        pack[2].rgb = clip!V((298 * y + 516 * u + 128) >> 8);
+    }
 }
+
diff --git a/source/dcv/imgproc/convolution.d b/source/dcv/imgproc/convolution.d
index 039a8a3..0763875 100644
--- a/source/dcv/imgproc/convolution.d
+++ b/source/dcv/imgproc/convolution.d
@@ -32,90 +32,93 @@ Authors: Relja Ljubobratovic
 
 License: $(LINK3 http://www.boost.org/LICENSE_1_0.txt, Boost Software License - Version 1.0).
 */
-
 module dcv.imgproc.convolution;
 
-/*
-v0.1 norm:
-conv (done)
-separable_conv
-
-v0.1+ plans:
-1d_conv_simd
-*/
-
-import std.traits : isAssignable;
-import std.range;
+import std.traits : isAssignable, ReturnType;
+import std.conv : to;
 import std.algorithm.comparison : equal;
-
-import std.algorithm.iteration : reduce;
-import std.algorithm.comparison : max, min;
-import std.exception : enforce;
 import std.parallelism : parallel, taskPool, TaskPool;
-import std.math : abs, floor;
+
+import ldc.attributes : fastmath;
 
 import mir.ndslice;
+import mir.ndslice.algorithm : ndReduce, Yes;
 
 import dcv.core.memory;
 import dcv.core.utils;
 
 /**
-Perform convolution to given range, using given kernel.
-Convolution is supported for 1, 2, and 3D slices.
+Perform convolution to given tensor, using given kernel.
+Convolution is supported for 1, 2, and 3 dimensional tensors.
 
 Params:
     bc = (Template parameter) Boundary Condition function used while indexing the image matrix.
-    range = Input range slice (1D, 2D, and 3D slice supported)
-    kernel = Convolution kernel slice. For 1D range, 1D kernel is expected. 
-    For 2D range, 2D kernele is expected. For 3D range, 2D or 3D kernel is expected - 
-    if 2D kernel is given, each item in kernel matrix is applied to each value in 
-    corresponding 2D coordinate in the range.
-    prealloc = Pre-allocated array where convolution result can be stored. Default 
+    input = Input tensor.
+    kernel = Convolution kernel tensor. For 1D input, 1D kernel is expected.
+    For 2D input, 2D kernel is expected. For 3D input, 2D or 3D kernel is expected -
+    if 2D kernel is given, each item in kernel matrix is applied to each value in
+    corresponding 2D coordinate in the input.
+    prealloc = Pre-allocated buffer where convolution result can be stored. Default
     value is emptySlice, where resulting array will be newly allocated. Also if
-    prealloc is not of same shape as input range, resulting array will be newly allocated. 
-    mask = Masking range. Convolution will skip each element where mask is 0. Default value
-    is empty slice, which tells that convolution will be performed on the whole range.
+    prealloc is not of same shape as input input, resulting array will be newly allocated.
+    mask = Masking input. Convolution will skip each element where mask is 0. Default value
+    is empty slice, which tells that convolution will be performed on the whole input.
     pool = Optional TaskPool instance used to parallelize computation.
 
 Returns:
-    Slice of resulting image after convolution.
+    Resulting image after convolution, of same type as input tensor.
+
+Note:
+    Input, mask and pre-allocated slices' strides must be the same.
 */
-Slice!(N, InputType*) conv(alias bc = neumann, InputType, KernelType, MaskType = InputType, size_t N, size_t NK)(Slice!(N,
-        InputType*) range, Slice!(NK, KernelType*) kernel, Slice!(N,
-        InputType*) prealloc = emptySlice!(N, InputType), Slice!(NK, MaskType*) mask = emptySlice!(NK, MaskType), TaskPool pool = taskPool)
+InputTensor conv
+    (alias bc = neumann, InputTensor, KernelTensor, MaskTensor = KernelTensor)
+    (InputTensor input, KernelTensor kernel, InputTensor prealloc = InputTensor.init,
+     MaskTensor mask = MaskTensor.init, TaskPool pool = taskPool)
+in
 {
+    static assert(isSlice!InputTensor, "Input tensor has to be of type mir.ndslice.slice.Slice");
+    static assert(isSlice!KernelTensor, "Kernel tensor has to be of type mir.ndslice.slice.Slice");
+    static assert(isSlice!MaskTensor, "Mask tensor has to be of type mir.ndslice.slice.Slice");
     static assert(isBoundaryCondition!bc, "Invalid boundary condition test function.");
-    static assert(isAssignable!(InputType, KernelType), "Uncompatible types for range and kernel");
+    static assert(isAssignable!(DeepElementType!InputTensor, DeepElementType!KernelTensor),
+            "Incompatible types for input and kernel");
 
-    if (!mask.empty && !mask.shape[].equal(range.shape[]))
-    {
-        import std.conv : to;
+    immutable N = ReturnType!(InputTensor.shape).length;
+    immutable NK = ReturnType!(KernelTensor.shape).length;
 
-        throw new Exception(
-                "Invalid mask shape: " ~ mask.shape[].to!string ~ ", range shape: " ~ range.shape[].to!string);
-    }
+    static assert(ReturnType!(MaskTensor.shape).length == NK, "Mask tensor has to be of same dimension as the kernel tensor.");
 
+    immutable invalidKernelMsg = "Invalid kernel dimension";
     static if (N == 1)
-    {
-        static assert(NK == 1, "Invalid kernel dimension");
-        return conv1Impl!bc(range, kernel, prealloc, mask, pool);
-    }
+        static assert(NK == 1, invalidKernelMsg);
     else static if (N == 2)
-    {
-        static assert(NK == 2, "Invalid kernel dimension");
-        return conv2Impl!bc(range, kernel, prealloc, mask, pool);
-    }
+        static assert(NK == 2, invalidKernelMsg);
     else static if (N == 3)
+        static assert(NK == 2, invalidKernelMsg);
+    else
+        static assert(0, "Convolution not implemented for given tensor dimension.");
+
+    assert(input.ptr != prealloc.ptr, "Preallocated and input buffer cannot point to the same memory.");
+
+    if (!mask.empty)
     {
-        static assert(NK == 2, "Invalid kernel dimension");
-        return conv3Impl!bc(range, kernel, prealloc, mask, pool);
+        assert(mask.shape == input.shape, "Invalid mask size. Should be of same size as input tensor.");
+        assert(input.structure.strides == mask.structure.strides, "Input input and mask need to have same strides.");
     }
+
+    if (prealloc.empty)
+        assert(input.stride!(N-1) == 1, "Input tensor has to be contiguous (i.e. input.stride!(N-1) == 1).");
     else
-    {
-        import std.conv : to;
+        assert(input.structure.strides == prealloc.structure.strides,
+                "Input input and result(preallocated) buffer need to have same strides.");
+}
+body
+{
+    if (prealloc.shape != input.shape)
+        prealloc = uninitializedSlice!(DeepElementType!InputTensor)(input.shape);
 
-        static assert(0, "Convolution over " ~ N.to!string ~ "D ranges is not implemented");
-    }
+    return convImpl!bc(input, kernel, prealloc, mask, pool);
 }
 
 unittest
@@ -130,119 +133,197 @@ unittest
 
 unittest
 {
-    import std.algorithm.comparison : equal;
-
-    auto image = new float[15 * 15].sliced(15, 15);
-    auto kernel = new float[3 * 3].sliced(3, 3);
+    auto image = slice!float(15, 15);
+    auto kernel = slice!float(3, 3);
     auto convres = conv(image, kernel);
-    assert(convres.shape[].equal(image.shape[]));
+    assert(convres.shape == image.shape);
 }
 
 unittest
 {
-    import std.algorithm.comparison : equal;
-
-    auto image = new float[15 * 15 * 3].sliced(15, 15, 3);
-    auto kernel = new float[3 * 3].sliced(3, 3);
+    auto image = slice!float(15, 15, 3);
+    auto kernel = slice!float(3, 3);
     auto convres = conv(image, kernel);
-    assert(convres.shape[].equal(image.shape[]));
+    assert(convres.shape == image.shape);
 }
 
-private:
-
-// TODO: implement SIMD
-Slice!(1, InputType*) conv1Impl(alias bc, InputType, KernelType, MaskType)(Slice!(1, InputType*) range,
-        Slice!(1, KernelType*) kernel, Slice!(1, InputType*) prealloc, Slice!(1, MaskType*) mask, TaskPool pool)
+nothrow @nogc @fastmath auto kapply(T)(T r, T i, T k)
 {
+    return r + i * k;
+}
 
-    if (prealloc.empty || prealloc.shape != range.shape)
-        prealloc = uninitializedArray!(InputType[])(cast(size_t)range.length).sliced(range.shape);
-
-    enforce(&range[0] != &prealloc[0], "Preallocated has to contain different data from that of a input range.");
+private:
 
-    auto rl = range.length;
-    int ks = cast(int)kernel.length; // kernel size
-    int kh = max(1, cast(int)(floor(cast(float)ks / 2.))); // kernel size half
-    int ke = cast(int)(ks % 2 == 0 ? kh - 1 : kh);
-    int rt = cast(int)(ks % 2 == 0 ? rl - 1 - kh : rl - kh); // range top
+auto convImpl
+    (alias bc = neumann, InputTensor, KernelTensor, MaskTensor)
+    (InputTensor input, KernelTensor kernel, InputTensor prealloc, MaskTensor mask, TaskPool pool) 
+if (ReturnType!(InputTensor.shape).length == 1)
+{
+    alias InputType = DeepElementType!InputTensor;
 
-    bool useMask = !mask.empty;
+    auto kl = kernel.length;
+    auto kh = kl / 2;
 
-    // run main (inner) loop
-    foreach (i; pool.parallel(iota(rl)))
+    if (mask.empty)
     {
-        if (useMask && !mask[i])
-            continue;
-        InputType v = 0;
-        for (int j = -kh; j < ke + 1; ++j)
+        auto packedWindows = assumeSameStructure!("result", "input")(prealloc, input).windows(kl);
+        foreach (p; pool.parallel(packedWindows))
         {
-            v += bc(range, i + j) * kernel[j + kh];
+            p[kh].result = ndReduce!(kapply!InputType, Yes.vectorized, Yes.fastmath)
+                                    (0.0f, p.ndMap!(p => p.input), kernel);
+        }
+    }
+    else
+    {
+        // TODO: extract masked convolution as separate function?
+        auto packedWindows = assumeSameStructure!("result", "input", "mask")(prealloc, input, mask).windows(kl);
+        foreach (p; pool.parallel(packedWindows))
+        {
+            if (p[$ / 2].mask)
+                p[$ / 2].result = ndReduce!(kapply!InputType)
+                                           (0.0f, p.ndMap!(p => p.input), kernel);
         }
-        prealloc[i] = v;
     }
 
+    handleEdgeConv1d!bc(input, prealloc, kernel, mask, 0, kl);
+    handleEdgeConv1d!bc(input, prealloc, kernel, mask, input.length - 1 - kh, input.length);
+
     return prealloc;
 }
 
-Slice!(2, InputType*) conv2Impl(alias bc, InputType, KernelType, MaskType)(Slice!(2, InputType*) range,
-        Slice!(2, KernelType*) kernel, Slice!(2, InputType*) prealloc, Slice!(2, MaskType*) mask, TaskPool pool)
+auto convImpl
+    (alias bc = neumann, InputTensor, KernelTensor, MaskTensor)
+    (InputTensor input, KernelTensor kernel, InputTensor prealloc, MaskTensor mask, TaskPool pool) 
+if (ReturnType!(InputTensor.shape).length == 2)
 {
+    auto krs = kernel.length!0; // kernel rows
+    auto kcs = kernel.length!1; // kernel rows
 
-    if (prealloc.empty || prealloc.shape != range.shape)
-        prealloc = uninitializedArray!(InputType[])(cast(size_t)range.elementsCount).sliced(range.shape);
-
-    enforce(&range[0, 0] != &prealloc[0, 0], "Preallocated has to contain different data from that of a input range.");
-
-    auto rr = range.length!0; // range rows
-    auto rc = range.length!1; // range columns
-
-    int krs = cast(int)kernel.length!0; // kernel rows
-    int kcs = cast(int)kernel.length!1; // kernel rows
-
-    int krh = max(1, cast(int)(floor(cast(float)krs / 2.))); // kernel rows size half
-    int kch = max(1, cast(int)(floor(cast(float)kcs / 2.))); // kernel rows size half
+    auto krh = krs / 2;
+    auto kch = kcs / 2;
 
-    bool useMask = !mask.empty;
+    auto useMask = !mask.empty;
 
-    // run inner body convolution of the matrix.
-    foreach (i; pool.parallel(iota(rr)))
+    if (mask.empty)
     {
-        auto row = prealloc[i, 0 .. rc];
-        foreach (j; iota(rc))
-        {
-            if (useMask && !mask[i, j])
-                continue;
-            InputType v = 0;
-            for (int ii = -krh; ii < krh + 1; ++ii)
+        auto packedWindows = assumeSameStructure!("result", "input")(prealloc, input).windows(krs, kcs);
+        foreach (prow; pool.parallel(packedWindows))
+            foreach (p; prow)
+            {
+                p[krh, kch].result = ndReduce!(kapply, Yes.vectorized, Yes.fastmath)
+                                              (0.0f, p.ndMap!(v => v.input), kernel);
+            }
+    }
+    else
+    {
+        auto packedWindows = assumeSameStructure!("result", "input", "mask")(prealloc, input, mask).windows(krs, kcs);
+        foreach (prow; pool.parallel(packedWindows))
+            foreach (p; prow)
             {
-                for (int jj = -kch; jj < kch + 1; ++jj)
+                if (p[krh, kch].mask)
                 {
-                    v += bc(range, i + ii, j + jj) * kernel[ii + krh, jj + kch];
+                    p[krh, kch].result = ndReduce!(kapply, Yes.vectorized, Yes.fastmath)
+                                                  (0.0f, p.ndMap!(v => v.input), kernel);
                 }
             }
-            row[j] = v;
-        }
     }
 
+    handleEdgeConv2d!bc(input, prealloc, kernel, mask, [0, input.length!0], [0, kch]); // upper row
+    handleEdgeConv2d!bc(input, prealloc, kernel, mask, [0, input.length!0], [input.length!1 - kch, input.length!1]); // lower row
+    handleEdgeConv2d!bc(input, prealloc, kernel, mask, [0, krh], [0, input.length!1]); // left column
+    handleEdgeConv2d!bc(input, prealloc, kernel, mask, [input.length!0 - krh, input.length!0], [0, input.length!1]); // right column
+
     return prealloc;
 }
 
-Slice!(3, InputType*) conv3Impl(alias bc, InputType, KernelType, MaskType, size_t NK)(Slice!(3,
-        InputType*) range, Slice!(NK, KernelType*) kernel, Slice!(3, InputType*) prealloc,
-        Slice!(NK, MaskType*) mask, TaskPool pool)
+auto convImpl
+    (alias bc = neumann, InputTensor, KernelTensor, MaskTensor)
+    (InputTensor input, KernelTensor kernel, InputTensor prealloc, MaskTensor mask, TaskPool pool) 
+if (ReturnType!(InputTensor.shape).length == 3)
 {
-    if (prealloc.empty || prealloc.shape != range.shape)
-        prealloc = uninitializedArray!(InputType[])(cast(size_t)range.elementsCount).sliced(range.shape);
-
-    enforce(&range[0, 0, 0] != &prealloc[0, 0, 0],
-            "Preallocated has to contain different data from that of a input range.");
-
-    foreach (i; iota(range.length!2))
+    foreach (i; 0 .. input.length!2)
     {
-        auto r_c = range[0 .. $, 0 .. $, i];
+        auto r_c = input[0 .. $, 0 .. $, i];
         auto p_c = prealloc[0 .. $, 0 .. $, i];
         r_c.conv(kernel, p_c, mask, pool);
     }
 
     return prealloc;
 }
+
+void handleEdgeConv1d(alias bc, T, K, M)(Slice!(1, T*) input, Slice!(1, T*) prealloc, Slice!(1,
+        K*) kernel, Slice!(1, M*) mask, size_t from, size_t to)
+in
+{
+    assert(from < to);
+}
+body
+{
+    int kl = cast(int)kernel.length;
+    int kh = kl / 2, i = cast(int)from, j;
+
+    bool useMask = !mask.empty;
+
+    T t;
+    foreach (ref p; prealloc[from .. to])
+    {
+        if (useMask && mask[i] <= 0)
+            goto loop_end;
+        t = 0;
+        j = -kh;
+        foreach (k; kernel)
+        {
+            t += bc(input, i + j) * k;
+            ++j;
+        }
+        p = t;
+    loop_end:
+        ++i;
+    }
+}
+
+void handleEdgeConv2d(alias bc, T, K, M)(Slice!(2, T*) input, Slice!(2, T*) prealloc, Slice!(2,
+        K*) kernel, Slice!(2, M*) mask, size_t[2] rowRange, size_t[2] colRange)
+in
+{
+    assert(rowRange[0] < rowRange[1]);
+    assert(colRange[0] < colRange[1]);
+}
+body
+{
+    int krl = cast(int)kernel.length!0;
+    int kcl = cast(int)kernel.length!1;
+    int krh = krl / 2, kch = kcl / 2;
+    int r = cast(int)rowRange[0], c, i, j;
+
+    bool useMask = !mask.empty;
+
+    auto roi = prealloc[rowRange[0] .. rowRange[1], colRange[0] .. colRange[1]];
+
+    T t;
+    foreach (prow; roi)
+    {
+        c = cast(int)colRange[0];
+        foreach (ref p; prow)
+        {
+            if (useMask && mask[r, c] <= 0)
+                goto loop_end;
+            t = 0;
+            i = -krh;
+            foreach (krow; kernel)
+            {
+                j = -kch;
+                foreach (k; krow)
+                {
+                    t += bc(input, r + i, c + j) * k;
+                    ++j;
+                }
+                ++i;
+            }
+            p = t;
+        loop_end:
+            ++c;
+        }
+        ++r;
+    }
+}
diff --git a/source/dcv/imgproc/filter.d b/source/dcv/imgproc/filter.d
index 0ed05a6..4263087 100644
--- a/source/dcv/imgproc/filter.d
+++ b/source/dcv/imgproc/filter.d
@@ -52,11 +52,15 @@ import std.algorithm.comparison : max;
 import std.algorithm.mutation : copy;
 import std.array : uninitializedArray;
 import std.parallelism : parallel, taskPool, TaskPool;
+import std.traits: ReturnType;
+
+import ldc.attributes : fastmath;
 
 import dcv.core.algorithm;
 import dcv.core.utils;
 
 import mir.ndslice;
+import mir.ndslice.algorithm : ndReduce, ndEach, Yes;
 
 /**
 Box kernel creation.
@@ -399,57 +403,31 @@ unittest
 /**
 Perform non-maxima filtering of the image.
 
-note: 
-proxy function, not a proper API! 
+Params:
+    input = Input matrix.
+    filterSize = Size of filtering kernel (aperture).
 
+Returns:
+    Input matrix, after filtering.
 */
-Slice!(2, T*) filterNonMaximum(T)(Slice!(2, T*) slice, size_t filterSize = 10)
+auto filterNonMaximum(Matrix)(Matrix input, size_t filterSize = 10)
+in
 {
+    static assert(isSlice!Matrix, "Input value should be of type mir.ndslice.slice.Slice.");
+    static assert(ReturnType!(Matrix.shape).length == 2, "Invalid Slice dimensionality - should be Matrix(2).");
+    assert(!input.empty && filterSize);
+}
+body
+{
+    immutable fs = filterSize;
+    immutable fsh = max(1, fs / 2);
 
-    assert(!slice.empty && filterSize);
-
-    typeof(slice) lmsw; // local maxima search window
-    int lms_r, lms_c;
-    int win_rows, win_cols;
-    float lms_val;
-    auto rows = slice.length!0;
-    auto cols = slice.length!1;
-
-    for (int br = 0; br < rows; br += filterSize / 2)
-    {
-        for (int bc = 0; bc < cols; bc += filterSize / 2)
-        {
-            win_rows = cast(int)((br + filterSize < rows) ? filterSize : filterSize - ((br + filterSize) - rows) - 1);
-            win_cols = cast(int)((bc + filterSize < cols) ? filterSize : filterSize - ((bc + filterSize) - cols) - 1);
-
-            if (win_rows <= 0 || win_cols <= 0)
-            {
-                continue;
-            }
-
-            lmsw = slice[br .. br + win_rows, bc .. bc + win_cols];
+    input
+        .windows(fs, fs)
+        .strided!(0, 1)(fsh, fsh)
+        .ndEach!( w => filterNonMaximumImpl(w));
 
-            lms_val = -1;
-            for (int r = 0; r < lmsw.length!0; r++)
-            {
-                for (int c = 0; c < lmsw.length!1; c++)
-                {
-                    if (lmsw[r, c] > lms_val)
-                    {
-                        lms_val = lmsw[r, c];
-                        lms_r = r;
-                        lms_c = c;
-                    }
-                }
-            }
-            lmsw[] = cast(T)0;
-            if (lms_val != -1)
-            {
-                lmsw[lms_r, lms_c] = cast(T)lms_val;
-            }
-        }
-    }
-    return slice;
+    return input;
 }
 
 /**
@@ -458,37 +436,35 @@ Calculate partial derivatives of an slice.
 Partial derivatives are calculated by convolving an slice with
 [-1, 1] kernel, horizontally and vertically.
 */
-void calcPartialDerivatives(T, V = T)(Slice!(2, T*) slice, ref Slice!(2, V*) fx, ref Slice!(2, V*) fy)
-        if (isFloatingPoint!V)
+void calcPartialDerivatives(InputTensor, V = DeepElementType!InputTensor)
+    (InputTensor input, ref Slice!(2, V*) fx, ref Slice!(2, V*) fy, TaskPool pool = taskPool) if (isFloatingPoint!V)
 in
 {
-    assert(!slice.empty);
+    static assert(isSlice!InputTensor, "Invalid input tensor type - has to be of type mir.ndslice.slice.Slice.");
+    static assert(ReturnType!(InputTensor.shape).length == 2, "Input tensor has to be 2 dimensional. (matrix)");
 }
 body
 {
-    import std.range : iota;
-    import std.array : array, uninitializedArray;
-    import std.algorithm : equal, reduce;
+    if(input.empty)
+        return;
 
-    auto itemLength = slice.elementsCount;
-    if (!fx.shape[].equal(slice.shape[]))
-        fx = uninitializedArray!(V[])(itemLength).sliced(slice.shape);
-    if (!fy.shape[].equal(slice.shape[]))
-        fy = uninitializedArray!(V[])(itemLength).sliced(slice.shape);
+    if (fx.shape != input.shape)
+        fx = uninitializedSlice!V(input.shape);
+    if (fy.shape != input.shape)
+        fy = uninitializedSlice!V(input.shape);
 
-    auto rows = slice.length!0;
-    auto cols = slice.length!1;
+    auto rows = input.length!0;
+    auto cols = input.length!1;
 
-    // calc mid-ground
-    foreach (r; 1.iota(rows))
+    foreach (r; pool.parallel(iota(1, rows)))
     {
         auto x_row = fx[r, 0 .. $];
         auto y_row = fy[r, 0 .. $];
-        foreach (c; 1.iota(cols))
+        foreach (c; 1 .. cols)
         {
-            auto imrc = slice[r, c];
-            x_row[c] = cast(V)(-1. * slice[r, c - 1] + imrc);
-            y_row[c] = cast(V)(-1. * slice[r - 1, c] + imrc);
+            auto imrc = input[r, c];
+            x_row[c] = cast(V)(-1. * input[r, c - 1] + imrc);
+            y_row[c] = cast(V)(-1. * input[r - 1, c] + imrc);
         }
     }
 
@@ -496,67 +472,70 @@ body
     auto x_row = fx[0, 0 .. $];
     auto y_row = fy[0, 0 .. $];
 
-    foreach (c; 0.iota(cols - 1))
+    foreach (c; pool.parallel(iota(cols - 1)))
     {
-        auto im_0c = slice[0, c];
-        x_row[c] = cast(V)(-1. * im_0c + slice[0, c + 1]);
-        y_row[c] = cast(V)(-1. * im_0c + slice[1, c]);
+        auto im_0c = input[0, c];
+        x_row[c] = cast(V)(-1. * im_0c + input[0, c + 1]);
+        y_row[c] = cast(V)(-1. * im_0c + input[1, c]);
     }
 
     auto x_col = fx[0 .. $, 0];
     auto y_col = fy[0 .. $, 0];
 
-    foreach (r; iota(rows - 1))
+    foreach (r; pool.parallel(iota(rows - 1)))
     {
-        auto im_r_0 = slice[r, 0];
-        x_col[r] = cast(V)(-1. * im_r_0 + slice[r, 1]);
-        y_col[r] = cast(V)(-1. * im_r_0 + slice[r + 1, 0]);
+        auto im_r_0 = input[r, 0];
+        x_col[r] = cast(V)(-1. * im_r_0 + input[r, 1]);
+        y_col[r] = cast(V)(-1. * im_r_0 + input[r + 1, 0]);
     }
 
     // edges corner pixels
-    fx[0, cols - 1] = cast(V)(-1 * slice[0, cols - 2] + slice[0, cols - 1]);
-    fy[0, cols - 1] = cast(V)(-1 * slice[0, cols - 1] + slice[1, cols - 1]);
-    fx[rows - 1, 0] = cast(V)(-1 * slice[rows - 1, 0] + slice[rows - 1, 1]);
-    fy[rows - 1, 0] = cast(V)(-1 * slice[rows - 2, 0] + slice[rows - 1, 0]);
+    fx[0, cols - 1] = cast(V)(-1 * input[0, cols - 2] + input[0, cols - 1]);
+    fy[0, cols - 1] = cast(V)(-1 * input[0, cols - 1] + input[1, cols - 1]);
+    fx[rows - 1, 0] = cast(V)(-1 * input[rows - 1, 0] + input[rows - 1, 1]);
+    fy[rows - 1, 0] = cast(V)(-1 * input[rows - 2, 0] + input[rows - 1, 0]);
 }
 
 /**
 Calculate gradient magnitude and orientation of an image slice.
 
 Params:
-    slice = Input slice of an image.
-    mag = Output magnitude value of gradients.
-    orient = Orientation value of gradients in radians.
+    input = Input slice of an image.
+    mag = Output magnitude value of gradients. If shape does not correspond to input, is
+        allocated anew.
+    orient = Orientation value of gradients in radians. If shape does not correspond to
+        input, is allocated anew.
     edgeKernelType = Optional convolution kernel type to calculate partial derivatives. 
     Default value is EdgeKernel.SIMPLE, which calls calcPartialDerivatives function
     to calculate derivatives. Other options will perform convolution with requested
     kernel type.
+
+Note:
+    Input slice's memory has to be contiguous. Magnitude and orientation slices' strides
+    have to be the identical.
 */
-void calcGradients(T, V = T)(Slice!(2, T*) slice, ref Slice!(2, V*) mag, ref Slice!(2, V*) orient,
-        EdgeKernel edgeKernelType = EdgeKernel.SIMPLE) if (isFloatingPoint!V)
+void calcGradients(InputTensor, V = DeepElementType!InputTensor)
+    (InputTensor input, ref Slice!(2, V*) mag, ref Slice!(2, V*) orient,
+     EdgeKernel edgeKernelType = EdgeKernel.SIMPLE, TaskPool pool = taskPool) if (isFloatingPoint!V)
 in
 {
-    assert(!slice.empty);
+    static assert(isSlice!InputTensor, "Input tensor has to be of type mir.ndslice.slice.Slice");
+    static assert(ReturnType!(InputTensor.shape).length == 2, "Input tensor has to be 2 dimensional. (matrix)");
+    assert(!input.empty);
+    assert(input.structure.strides[$-1] == 1, "Input slice's memory has to be contiguous."); // TODO check other dimensions.
 }
 body
 {
-    import std.array : uninitializedArray;
-    import std.math : sqrt, atan2;
+    if (mag.shape != input.shape)
+        mag = uninitializedSlice!V(input.shape);
 
-    if (mag.shape[] != slice.shape[])
-    {
-        mag = uninitializedArray!(V[])(slice.length!0 * slice.length!1).sliced(slice.shape);
-    }
-
-    if (orient.shape[] != slice.shape[])
-    {
-        orient = uninitializedArray!(V[])(slice.length!0 * slice.length!1).sliced(slice.shape);
-    }
+    if (orient.shape != input.shape)
+        orient = uninitializedSlice!V(input.shape);
 
     Slice!(2, V*) fx, fy;
     if (edgeKernelType == EdgeKernel.SIMPLE)
     {
-        calcPartialDerivatives(slice, fx, fy);
+        calcPartialDerivatives(input, fx, fy, pool);
     }
     else
     {
@@ -565,130 +544,99 @@ body
         Slice!(2, V*) kx, ky;
         kx = edgeKernel!V(edgeKernelType, GradientDirection.DIR_X);
         ky = edgeKernel!V(edgeKernelType, GradientDirection.DIR_Y);
-        fx = slice.conv(kx);
-        fy = slice.conv(ky);
+        fx = input.conv(kx, emptySlice!(2, V), emptySlice!(2, V), pool);
+        fy = input.conv(ky, emptySlice!(2, V), emptySlice!(2, V), pool);
     }
 
-    foreach (i; 0 .. slice.length!0)
+    assert(fx.structure.strides == mag.structure.strides  ||
+            fx.structure.strides == orient.structure.strides,
+            "Magnitude and orientation slices must be contiguous.");
+
+    if (mag.structure.strides == orient.structure.strides &&
+            mag.structure.strides == fx.structure.strides)
     {
-        foreach (j; 0 .. slice.length!1)
+        auto data = assumeSameStructure!("fx", "fy", "mag", "orient")(fx, fy, mag, orient);
+        foreach(row; pool.parallel(data))
         {
-            mag[i, j] = cast(V)sqrt(fx[i, j] ^^ 2 + fy[i, j] ^^ 2);
-            orient[i, j] = cast(V)atan2(fy[i, j], fx[i, j]);
+            row.ndEach!( p => calcGradientsImpl(p.fx, p.fy, p.mag, p.orient), Yes.vectorized, Yes.fastmath);
         }
     }
+    else
+    {
+        foreach(row; pool.parallel(indexSlice(input.shape)))
+        {
+            row.ndEach!(i => calcGradientsImpl(fx[i], fy[i], mag[i], orient[i]), Yes.vectorized, Yes.fastmath);
+        }
+    }
+}
 
+@fastmath void calcGradientsImpl(T)(T fx, T fy, ref T mag, ref T orient)
+{
+    import ldc.intrinsics : sqrt = llvm_sqrt;
+    import std.math : atan2;
+    mag = sqrt(fx*fx + fy*fy);
+    orient = atan2(fy, fx);
 }
 
 /**
-Edge detection impuls non-maxima supression.
+Edge detection impulse non-maxima supression.
 
 Filtering used in canny edge detection algorithm - suppresses all 
 edge impulses (gradient values along edges normal) except the peek value.
 
 Params:
-mag = Gradient magnitude.
-orient = Gradient orientation of the same image source as magnitude.
-prealloc = Optional pre-allocated buffer for output slice.
+    mag = Gradient magnitude.
+    orient = Gradient orientation of the same image source as magnitude.
+    prealloc = Optional pre-allocated buffer for output slice.
 
-see:
-dcv.imgproc.filter.calcGradients, dcv.imgproc.convolution
+Note:
+    Orientation and pre-allocated structures must match. If prealloc
+    buffer is not given, orient memory has to be contiguous.
+See:
+    dcv.imgproc.filter.calcGradients, dcv.imgproc.convolution
 */
-Slice!(2, V*) nonMaximaSupression(T, V = T)(Slice!(2, T*) mag, Slice!(2, T*) orient, Slice!(2,
-        V*) prealloc = emptySlice!(2, V))
+Slice!(2, V*) nonMaximaSupression(InputTensor, V = DeepElementType!InputTensor)
+(InputTensor mag, InputTensor orient, Slice!(2, V*) prealloc = emptySlice!(2, V), TaskPool pool = taskPool)
 in
 {
+    static assert(isSlice!InputTensor, "Input tensor has to be of type mir.ndslice.slice.Slice");
+    static assert(ReturnType!(InputTensor.shape).length == 2, "Input tensor has to be 2 dimensional. (matrix)");
+
     assert(!mag.empty && !orient.empty);
-    assert(mag.shape[] == orient.shape[]);
+    assert(mag.shape == orient.shape);
+    assert(mag.structure.strides == orient.structure.strides, "Magnitude and Orientation tensor strides have to be the same.");
 }
 body
 {
     import std.array : uninitializedArray;
-    import std.math : PI, abs;
+    import std.math : PI;
 
-    if (prealloc.shape[] != orient.shape[])
-    {
-        prealloc = uninitializedArray!(V[])(mag.length!0 * mag.length!1).sliced(mag.shape);
-    }
+    alias F = DeepElementType!InputTensor;
 
-    size_t[2] p0;
-    size_t[2] p1;
+    static if (isFloatingPoint!F)
+        import ldc.intrinsics : abs = llvm_fabs;
+    else
+        import std.math : abs;
 
-    foreach (i; 1 .. mag.length!0 - 1)
-    {
-        foreach (j; 1 .. mag.length!1 - 1)
-        {
-            auto ang = orient[i, j];
-            auto aang = abs(ang);
+    if (prealloc.shape != orient.shape
+            || prealloc.structure.strides != mag.structure.strides)
+        prealloc = uninitializedSlice!V(mag.shape);
 
-            immutable pi = 3.15;
-            immutable pi8 = pi / 8.0;
+    assert(prealloc.structure.strides == orient.structure.strides,
+            "Orientation and preallocated slice strides do not match.");
 
-            if (aang <= pi && aang > 7.0 * pi8)
-            {
-                p0[0] = j - 1;
-                p0[1] = i;
-                p1[0] = j + 1;
-                p1[1] = i;
-            }
-            else if (ang >= -7.0 * pi8 && ang < -5.0 * pi8)
-            {
-                p0[0] = j - 1;
-                p0[1] = i - 1;
-                p1[0] = j + 1;
-                p1[1] = i + 1;
-            }
-            else if (ang <= 7.0 * pi8 && ang > 5.0 * pi8)
-            {
-                p0[0] = j + 1;
-                p0[1] = i - 1;
-                p1[0] = j - 1;
-                p1[1] = i + 1;
-            }
-            else if (ang >= pi8 && ang < 3.0 * pi8)
-            {
-                p0[0] = j - 1;
-                p0[1] = i + 1;
-                p1[0] = j + 1;
-                p1[1] = i - 1;
-            }
-            else if (ang <= -pi8 && ang > -3.0 * pi8)
-            {
-                p0[0] = j + 1;
-                p0[1] = i + 1;
-                p1[0] = j - 1;
-                p1[1] = i - 1;
-            }
-            else if (ang >= -5.0 * pi8 && ang < -3.0 * pi8)
-            {
-                p0[0] = j;
-                p0[1] = i - 1;
-                p1[0] = j;
-                p1[1] = i + 1;
-            }
-            else if (ang <= 5.0 * pi8 && ang > 3.0 * pi8)
-            {
-                p0[0] = j;
-                p0[1] = i + 1;
-                p1[0] = j;
-                p1[1] = i - 1;
-            }
-            else if (aang >= 0.0 && aang < pi8)
-            {
-                p0[0] = j + 1;
-                p0[1] = i;
-                p1[0] = j - 1;
-                p1[1] = i;
-            }
+    auto magWindows = mag.windows(3, 3);
+    auto dPack = assumeSameStructure!("result", "ang")(prealloc[1 .. $-1, 1 .. $-1], orient[1 .. $-1, 1 .. $-1]);
 
-            if (mag[p1[1], p1[0]] <= mag[p0[1], p0[0]])
-            {
-                prealloc[i, j] = 0;
-            }
-            else
-            {
-                prealloc[i, j] = mag[i, j];
-            }
+    auto innerShape = magWindows.shape;
+
+    foreach(r; pool.parallel(iota(innerShape[0])))
+    {
+        auto d = dPack[r];
+        auto m = magWindows[r];
+        foreach(c; 0 .. innerShape[1])
+        {
+            nonMaximaSupressionImpl(d[c], m[c]);
         }
     }
 
@@ -706,7 +654,8 @@ Params:
     prealloc = Optional pre-allocated buffer.
 */
 Slice!(2, V*) canny(V, T)(Slice!(2, T*) slice, T lowThresh, T upThresh,
-        EdgeKernel edgeKernelType = EdgeKernel.SOBEL, Slice!(2, V*) prealloc = emptySlice!(2, V))
+        EdgeKernel edgeKernelType = EdgeKernel.SOBEL,
+        Slice!(2, V*) prealloc = emptySlice!(2, V), TaskPool pool = taskPool)
 {
     import dcv.imgproc.threshold;
     import dcv.core.algorithm : ranged;
@@ -715,11 +664,10 @@ Slice!(2, V*) canny(V, T)(Slice!(2, T*) slice, T lowThresh, T upThresh,
 
     Slice!(2, float*) mag, orient;
     calcGradients(slice, mag, orient, edgeKernelType);
-    auto nonmax = nonMaximaSupression(mag, orient);
 
-    return nonmax
+    return nonMaximaSupression(mag, orient, emptySlice!(2, T), pool)
         .ranged(0, upval)
-        .threshold!V(lowThresh, upThresh);
+        .threshold(lowThresh, upThresh, prealloc);
 }
 
 /**
@@ -751,95 +699,81 @@ Params:
 Returns:
     Slice of filtered image.
 */
-Slice!(N, OutputType*) bilateralFilter(alias bc = neumann, InputType, OutputType = InputType, size_t N)(Slice!(N,
-        InputType*) slice, float sigmaCol, float sigmaSpace, uint kernelSize, Slice!(N,
-        OutputType*) prealloc = emptySlice!(N, OutputType), TaskPool pool = taskPool) if (N == 2)
+Slice!(N, OutputType*) bilateralFilter
+    (alias bc = neumann, InputTensor, OutputType = DeepElementType!(InputTensor), size_t N = ReturnType!(InputTensor.shape).length)
+    (InputTensor input, float sigmaCol, float sigmaSpace, size_t kernelSize, Slice!(N, OutputType*) prealloc = emptySlice!(N, OutputType),
+     TaskPool pool = taskPool) if (N == 2)
 in
 {
+    static assert(isSlice!InputTensor, "Input tensor has to be of type mir.ndslice.slice.Slice");
     static assert(isBoundaryCondition!bc, "Invalid boundary condition test function.");
-    assert(!slice.empty);
+    assert(!input.empty);
     assert(kernelSize % 2);
 }
 body
 {
-    import std.algorithm.comparison : max;
-    import std.algorithm.iteration : reduce;
-    import std.math : exp, sqrt;
-    import mir.glas.l1 : asum;
+    if (prealloc.shape != input.shape)
+        prealloc = uninitializedSlice!OutputType(input.shape);
 
-    if (prealloc.empty || prealloc.shape != slice.shape)
-    {
-        prealloc = uninitializedArray!(OutputType[])(slice.elementsCount).sliced(slice.shape);
-    }
+    auto ks = kernelSize;
+    auto kh = max(1, ks / 2);
 
-    int ks = cast(int)kernelSize;
-    int ksh = max(1, ks / 2);
-    int rows = cast(int)slice.length!0;
-    int cols = cast(int)slice.length!1;
+    auto inputWindows = input.windows(kernelSize, kernelSize);
+    auto innerBody = prealloc[kh .. $ - kh, kh .. $ - kh];
+    auto inShape = innerBody.shape;
+    auto shape = input.shape;
+    auto kId = indexSlice(ks, ks).ndMap!(v => [cast(int)v[0] - cast(int)kh, cast(int)v[1] - cast(int)kh]).slice;
 
-    foreach (r; pool.parallel(iota(rows)))
+    auto threadMask = pool.workerLocalStorage(slice!float(ks, ks));
+
+    foreach(r; pool.parallel(iota(inShape[0])))
     {
-        auto mask = new float[ks * ks].sliced(ks, ks);
-        foreach (c; 0 .. cols)
+        auto maskBuf = threadMask.get();
+        foreach(c; 0 .. inShape[1])
         {
-            auto p_val = slice[r, c];
-
-            // generate mask
-            int i = 0;
-            int j;
-            foreach (int kr; r - ksh .. r + ksh + 1)
-            {
-                j = 0;
-                auto rk = (r - kr) ^^ 2;
-                foreach (int kc; c - ksh .. c + ksh + 1)
-                {
-                    auto ck = (c - kc) ^^ 2;
-                    auto cdiff = bc(slice, kr, kc) - p_val;
-                    float c_val = exp((ck + rk) / (-2.0f * sigmaSpace * sigmaSpace));
-                    float s_val = exp((cdiff * cdiff) / (-2.0f * sigmaCol * sigmaCol));
-                    mask[i, j] = c_val * s_val;
-                    ++j;
-                }
-                ++i;
-            }
-
-            // normalize mask and calculate result value.
-            float mask_sum = mask.asum;
-            float res_val = 0.0f;
+            innerBody[r, c] = cast(OutputType)bilateralFilterImpl(inputWindows[r, c], maskBuf, sigmaCol, sigmaSpace);
+        }
+    }
 
-            i = 0;
-            foreach (kr; r - ksh .. r + ksh + 1)
+    foreach(border; pool.parallel(input.shape.borders(ks)[]))
+    {
+        auto maskBuf = threadMask.get();
+        foreach(r; border.rows)
+            foreach(c; border.cols)
             {
-                j = 0;
-                foreach (kc; c - ksh .. c + ksh + 1)
-                {
-                    res_val += (mask[i, j] / mask_sum) * bc(slice, kr, kc);
-                    ++j;
-                }
-                ++i;
+                auto inputWindow = kId.ndMap!(v => bc(input, r - v[0], c - v[1]));
+                prealloc[r, c] = cast(OutputType)bilateralFilterImpl(inputWindow, maskBuf, sigmaCol, sigmaSpace);
             }
-
-            prealloc[r, c] = cast(OutputType)res_val;
-        }
     }
 
     return prealloc;
 }
 
 /// ditto
-Slice!(N, OutputType*) bilateralFilter(alias bc = neumann, InputType, OutputType = InputType, size_t N)(Slice!(N,
-        InputType*) slice, float sigmaCol, float sigmaSpace, uint kernelSize, Slice!(N,
-        OutputType*) prealloc = emptySlice!(N, OutputType)) if (N == 3)
+Slice!(N, OutputType*) bilateralFilter
+    (alias bc = neumann, InputTensor, OutputType = DeepElementType!(InputTensor), size_t N = ReturnType!(InputTensor.shape).length)
+    (InputTensor input, float sigmaCol, float sigmaSpace, size_t kernelSize, Slice!(N, OutputType*) prealloc = emptySlice!(N, OutputType),
+     TaskPool pool = taskPool) if (N == 3)
+in
+{
+    static assert(isSlice!InputTensor, "Input tensor has to be of type mir.ndslice.slice.Slice");
+    static assert(isBoundaryCondition!bc, "Invalid boundary condition test function.");
+    static assert(isFloatingPoint!(DeepElementType!InputTensor), "Input tensor value type has to be of floating point type.");
+    assert(!input.empty);
+    assert(kernelSize % 2);
+}
+body
 {
-    if (prealloc.empty || prealloc.shape != slice.shape)
+    if (prealloc.empty || prealloc.shape != input.shape)
     {
-        prealloc = uninitializedArray!(OutputType[])(slice.elementsCount).sliced(slice.shape);
+        prealloc = uninitializedSlice!OutputType(input.shape);
     }
 
-    foreach (channel; 0 .. slice.length!2)
+    foreach (channel; 0 .. input.length!2)
     {
-        bilateralFilter!(bc, InputType, OutputType)(slice[0 .. $, 0 .. $, channel], sigmaCol, sigmaSpace,
-                kernelSize, prealloc[0 .. $, 0 .. $, channel]);
+        auto inch = input[0 .. $, 0 .. $, channel];
+        auto prech = prealloc[0 .. $, 0 .. $, channel];
+        bilateralFilter!(bc, typeof(inch), OutputType, 2)(inch, sigmaCol, sigmaSpace, kernelSize, prech, pool);
     }
 
     return prealloc;
@@ -1229,8 +1163,110 @@ Slice!(2, T*) close(alias BoundaryConditionTest = neumann, T)(Slice!(2, T*) slic
             BoundaryConditionTest)(slice, kernel, emptySlice!(2, T), pool), kernel, prealloc, pool);
 }
 
+@fastmath void calcBilateralMask(Window, Mask)(Window window, Mask mask, float sigmaCol, float sigmaSpace)
+{
+    import ldc.intrinsics : exp = llvm_exp, sqrt = llvm_sqrt;
+
+    auto in_val = window[$ / 2, $ / 2];
+    float rd, cd, c_val, s_val;
+    float i, j, wl2;
+    wl2 = -(cast(float)window.length!0 / 2.0f);
+    i = wl2;
+    foreach (r; 0 .. window.length!0)
+    {
+        rd = i * i;
+        j = wl2;
+        foreach (c; 0 .. window.length!1)
+        {
+            cd = j * j;
+            auto cdiff = cast(float)(window[r, c] - in_val);
+            c_val = exp((cd + rd) / (-2.0f * sigmaCol * sigmaCol));
+            s_val =  exp((cdiff*cdiff) / (-2.0f * sigmaSpace * sigmaSpace));
+            mask[r, c] = c_val * s_val;
+            j++;
+        }
+        i++;
+    }
+}
+
+@fastmath T calcBilateralValue(T, M)(T r, T i, M m) 
+{
+    return cast(T)(r + i*m);
+}
+
 private:
 
+void nonMaximaSupressionImpl(DataPack, MagWindow)(DataPack p, MagWindow magWin)
+{
+    alias F = typeof(p.result);
+
+    auto ang = p.ang;
+    auto aang = abs(ang);
+
+    auto mag = magWin[1, 1];
+    typeof(mag) mb, ma; // magnitude before and after cursor
+
+    immutable pi = 3.15f;
+    immutable pi8 = pi / 8.0f;
+
+    if (aang <= pi && aang > 7.0f * pi8)
+    {
+        mb = magWin[1, 0];
+        ma = magWin[1, 2];
+    }
+    else if (ang >= -7.0f * pi8 && ang < -5.0f * pi8)
+    {
+        mb = magWin[0, 0];
+        ma = magWin[2, 2];
+    }
+    else if (ang <= 7.0f * pi8 && ang > 5.0f * pi8)
+    {
+        mb = magWin[0, 2];
+        ma = magWin[2, 0];
+    }
+    else if (ang >= pi8 && ang < 3.0f * pi8)
+    {
+        mb = magWin[2, 0];
+        ma = magWin[0, 2];
+    }
+    else if (ang <= -pi8 && ang > -3.0f * pi8)
+    {
+        mb = magWin[2, 2];
+        ma = magWin[0, 0];
+    }
+    else if (ang >= -5.0f * pi8 && ang < -3.0f * pi8)
+    {
+        mb = magWin[0, 1];
+        ma = magWin[2, 1];
+    }
+    else if (ang <= 5.0f * pi8 && ang > 3.0f * pi8)
+    {
+        mb = magWin[2, 1];
+        ma = magWin[0, 1];
+    }
+    else if (aang >= 0.0f && aang < pi8)
+    {
+        mb = magWin[1, 2];
+        ma = magWin[1, 0];
+    }
+
+    p.result = cast(F)((ma > mb) ? 0 : mag);
+}
+
+auto bilateralFilterImpl(Window, Mask)(Window window, Mask mask, float sigmaCol, float sigmaSpace)
+{
+    import mir.glas.l1 : asum;
+
+    calcBilateralMask(window, mask, sigmaCol, sigmaSpace);
+    auto mask_norm = 1.0f / float(mask.asum);
+    mask.ndEach!( (ref v) { v *= mask_norm; }, Yes.vectorized, Yes.fastmath);
+
+    alias T = DeepElementType!Window;
+    alias M = DeepElementType!Mask;
+
+    return ndReduce!(calcBilateralValue!(T, M), Yes.vectorized, Yes.fastmath)(T(0), window, mask);
+}
+
 void medianFilterImpl1(alias bc, T, O)(Slice!(1, T*) slice, Slice!(1, O*) filtered,
     size_t kernelSize, TaskPool pool)
 {
@@ -1387,3 +1423,31 @@ body
     return prealloc;
 }
 
+void filterNonMaximumImpl(Window)(Window window)
+{
+    alias T = DeepElementType!Window;
+
+    static if (isFloatingPoint!T)
+        auto lmsVal = -T.max;
+    else
+        auto lmsVal = T.min;
+
+    T *locPtr = null;
+
+    foreach(row; window)
+        foreach(ref e; row)
+        {
+            if (e > lmsVal)
+            {
+                locPtr = &e;
+                lmsVal = e;
+            }
+            e = T(0);
+        }
+
+    if (locPtr !is null)
+    {
+        *locPtr = lmsVal;
+    }
+}
+
diff --git a/source/dcv/imgproc/imgmanip.d b/source/dcv/imgproc/imgmanip.d
index fc4b21f..3101914 100644
--- a/source/dcv/imgproc/imgmanip.d
+++ b/source/dcv/imgproc/imgmanip.d
@@ -175,10 +175,11 @@ Params:
 Returns:
     Warped input image by given map.
 */
-Slice!(N, T*) warp(alias interp = linear, size_t N, T, V)(Slice!(N, T*) image, Slice!(3, V*) map,
-        Slice!(N, T*) prealloc = emptySlice!(N, T)) pure
+pure auto warp(alias interp = linear, ImageTensor, MapTensor)(ImageTensor image, MapTensor map,
+        ImageTensor prealloc = ImageTensor.init)
 {
-    return pixelWiseDisplacementImpl!(linear, warpImpl, N, T, V)(image, map, prealloc);
+    return pixelWiseDisplacement!(linear, DisplacementType.WARP, ImageTensor, MapTensor)
+        (image, map, prealloc);
 }
 
 /**
@@ -198,10 +199,11 @@ Params:
 Returns:
     Remapped input image by given map.
 */
-Slice!(N, T*) remap(alias interp = linear, size_t N, T, V)(Slice!(N, T*) image, Slice!(3,
-        V*) map, Slice!(N, T*) prealloc = emptySlice!(N, T)) pure
+pure auto remap(alias interp = linear, ImageTensor, MapTensor)(ImageTensor image, MapTensor map,
+        ImageTensor prealloc = ImageTensor.init)
 {
-    return pixelWiseDisplacementImpl!(linear, remapImpl, N, T, V)(image, map, prealloc);
+    return pixelWiseDisplacement!(linear, DisplacementType.REMAP, ImageTensor, MapTensor)
+        (image, map, prealloc);
 }
 
 /// Test if warp and remap always returns slice of corresponding format.
@@ -257,90 +259,87 @@ unittest
     assert(&remapped[0, 0] == &remappedRetVal[0, 0]);
 }
 
-private
+private enum DisplacementType
 {
+    WARP,
+    REMAP
+}
 
-    Slice!(N, T*) pixelWiseDisplacementImpl(alias interp, alias dispFunc, size_t N, T, V)(Slice!(N,
-            T*) image, Slice!(3, V*) map, Slice!(N, T*) prealloc = emptySlice!(N, T)) pure
-    {
-        static assert(isNumeric!T);
-        import std.algorithm.comparison : equal;
-        import std.algorithm.iteration : reduce;
-        import std.array : array, uninitializedArray;
-
-        typeof(image) warped;
-        if (prealloc.empty || !prealloc.shape.array.equal(image.shape.array))
-        {
-            warped = uninitializedArray!(T[])(image.elementsCount).sliced(image.shape);
-        }
-        else
-        {
-            enforce(&(warped.byElement.front) != &(image.byElement.front),
-                    "Invalid preallocation slice - it must not share data with input image slice");
-            warped = prealloc;
-        }
+private pure auto pixelWiseDisplacement(alias interp, DisplacementType disp, ImageTensor, MapTensor)
+    (ImageTensor image, MapTensor map, ImageTensor prealloc)
+in
+{
+    assert(!image.empty, "Input image is empty");
+    assert(map.shape[0 .. 2] == image.shape[0 .. 2], "Invalid map size.");
+}
+body
+{
+    import std.traits : ReturnType;
 
-        static if (N == 2)
-        {
-            dispFunc!(interp, T, V)(image, map, warped);
-        }
-        else static if (N == 3)
-        {
-            auto imp = image.pack!1;
-            foreach (i; 0 .. image.length!2)
-            {
-                dispFunc!(interp, T, V)(image[0 .. $, 0 .. $, i], map, warped[0 .. $, 0 .. $, i]);
-            }
-        }
-        else
-        {
-            import std.conv : to;
+    static assert(isSlice!ImageTensor, "Image type has to be of type mir.ndslice.slice.Slice");
+    static assert(isSlice!MapTensor, "Map type has to be of type mir.ndslice.slice.Slice");
+    immutable N = ReturnType!(ImageTensor.shape).length;
+    static assert(ReturnType!(MapTensor.shape).length == 3,
+            "Invalid map tensor dimension - should be matrix of [x, y] displacements (3D).");
 
-            static assert(0, "Invalid slice dimension - " ~ N.to!string);
-        }
-        return warped;
+    if (prealloc.shape != image.shape)
+    {
+        prealloc = uninitializedSlice!(DeepElementType!ImageTensor)(image.shape);
     }
 
-    Slice!(2, T*) warpImpl(alias interp, T, V)(Slice!(2, T*) image, Slice!(3, V*) map, Slice!(2, T*) warped) pure
+    static if (N == 2)
+    {
+        displacementImpl!(interp, disp, ImageTensor, MapTensor)
+            (image, map, prealloc);
+    }
+    else static if (N == 3)
     {
-        auto const rows = image.length!0;
-        auto const cols = image.length!1;
-        const auto rf = cast(float)rows;
-        const auto cf = cast(float)cols;
-        foreach (i; 0 .. rows)
+        foreach (i; 0 .. image.length!2)
         {
-            foreach (j; 0 .. cols)
-            {
-                auto x = cast(float)i + map[i, j, 1];
-                auto y = cast(float)j + map[i, j, 0];
-                if (x >= 0.0f && x < rf && y >= 0.0f && y < cf)
-                {
-                    warped[i, j] = interp(image, x, y);
-                }
-            }
+            auto imagec = image[0 .. $, 0 .. $, i];
+            auto resultc = prealloc[0 .. $, 0 .. $, i];
+            displacementImpl!(interp, disp, typeof(imagec), MapTensor)
+                (imagec, map, resultc);
         }
-        return warped;
     }
+    else
+        static assert(0, "Pixel displacement operations are supported only for 2D and 3D slices.");
 
-    Slice!(2, T*) remapImpl(alias interp, T, V)(Slice!(2, T*) image, Slice!(3, V*) map, Slice!(2, T*) remapped) pure
+    return prealloc;
+}
+
+private pure void displacementImpl(alias interp, DisplacementType disp, ImageMatrix, MapTensor)
+    (ImageMatrix image, MapTensor map, ImageMatrix result)
+{
+    static if (disp == DisplacementType.WARP)
+        float r = 0.0f, c;
+    immutable rf = cast(float)image.length!0;
+    immutable cf = cast(float)image.length!1;
+    for (; !result.empty; result.popFront, map.popFront)
     {
-        auto const rows = image.length!0;
-        auto const cols = image.length!1;
-        const auto rf = cast(float)rows;
-        const auto cf = cast(float)cols;
-        foreach (i; 0 .. rows)
+        auto rrow = result.front;
+        auto mrow = map.front;
+        static if (disp == DisplacementType.WARP) c = 0.0f;
+        for (; !rrow.empty; rrow.popFront, mrow.popFront)
         {
-            foreach (j; 0 .. cols)
+            auto m = mrow.front;
+            static if (disp == DisplacementType.WARP)
             {
-                auto x = map[i, j, 1];
-                auto y = map[i, j, 0];
-                if (x >= 0.0f && x < rf && y >= 0.0f && y < cf)
-                {
-                    remapped[i, j] = interp(image, x, y);
-                }
+                float rr = r + m[1];
+                float cc = c + m[0];
+            }
+            else
+            {
+                float rr = m[1];
+                float cc = m[0];
+            }
+            if (rr >= 0.0f && rr < rf && cc >= 0.0f && cc < cf)
+            {
+                rrow.front = interp(image, rr, cc);
             }
+            static if (disp == DisplacementType.WARP) ++c;
         }
-        return remapped;
+        static if (disp == DisplacementType.WARP) ++r;
     }
 }
 
diff --git a/source/dcv/imgproc/interpolate.d b/source/dcv/imgproc/interpolate.d
index 2c3e4f6..9badc87 100644
--- a/source/dcv/imgproc/interpolate.d
+++ b/source/dcv/imgproc/interpolate.d
@@ -12,10 +12,12 @@ module dcv.imgproc.interpolate;
 import std.range : isRandomAccessRange, ElementType;
 import std.traits : isNumeric, isScalarType, isIntegral, allSameType, allSatisfy, ReturnType,
     isFloatingPoint;
-import std.exception;
+
+import ldc.attributes : fastmath;
 
 import mir.ndslice;
 
+
 /**
 Test if given function is proper form for interpolation.
 */
@@ -59,7 +61,7 @@ Params:
 Returns:
     Interpolated resulting value.
 */
-T linear(T, size_t N, Position...)(Slice!(N, T*) slice, Position pos) pure 
+pure T linear(T, size_t N, Position...)(Slice!(N, T*) slice, Position pos)
         if (isNumeric!T && isScalarType!T && allSameType!Position && allSatisfy!(isNumeric, Position))
 {
     // TODO: document
@@ -104,9 +106,9 @@ unittest
 
 private:
 
-auto linearImpl_1(T)(Slice!(1, T*) range, double pos) pure
+pure @fastmath auto linearImpl_1(T)(Slice!(1, T*) range, double pos)
 {
-    import std.math : floor;
+    import ldc.intrinsics : floor = llvm_floor;
 
     assert(pos < range.length);
 
@@ -132,9 +134,9 @@ auto linearImpl_1(T)(Slice!(1, T*) range, double pos) pure
     return cast(T)(v1 * (1. - weight) + v2 * (weight));
 }
 
-auto linearImpl_2(T)(Slice!(2, T*) range, double pos_x, double pos_y) pure
+pure @fastmath auto linearImpl_2(T)(Slice!(2, T*) range, double pos_x, double pos_y)
 {
-    import std.math : floor;
+    import ldc.intrinsics : floor = llvm_floor;
 
     assert(pos_x < range.length!0 && pos_y < range.length!1);
 
@@ -170,3 +172,4 @@ auto linearImpl_2(T)(Slice!(2, T*) range, double pos_x, double pos_y) pure
     }
     return cast(T)(v1 * w00 + v2 * w01 + v3 * w10 + v4 * w11);
 }
+
diff --git a/source/dcv/imgproc/threshold.d b/source/dcv/imgproc/threshold.d
index c2d9fd7..7dda90a 100644
--- a/source/dcv/imgproc/threshold.d
+++ b/source/dcv/imgproc/threshold.d
@@ -10,6 +10,7 @@ License: $(LINK3 http://www.boost.org/LICENSE_1_0.txt, Boost Software License -
 module dcv.imgproc.threshold;
 
 import mir.ndslice;
+import mir.ndslice.algorithm : ndEach, Yes;
 
 import dcv.core.utils : emptySlice;
 
@@ -33,65 +34,55 @@ Params:
     lowThresh = Lower threshold value.
     highThresh = Higher threshold value.
     prealloc = Optional pre-allocated slice buffer for output.
+
+Note:
+    Input and pre-allocated buffer slice, should be of same structure
+    (i.e. have same strides). If prealloc buffer is not given, and is
+    allocated anew, input slice memory must be contiguous.
 */
-Slice!(N, OutputType*) threshold(OutputType, InputType, size_t N)(Slice!(N, InputType*) slice,
+nothrow Slice!(N, OutputType*) threshold(OutputType, InputType, size_t N)(Slice!(N, InputType*) input,
         InputType lowThresh, InputType highThresh, Slice!(N, OutputType*) prealloc = emptySlice!(N, OutputType))
 in
 {
-
     //TODO: consider leaving upper value, and not setting it to 1.
     assert(lowThresh <= highThresh);
-    assert(!slice.empty);
+    assert(!input.empty);
 }
 body
 {
-    import std.array : uninitializedArray;
-    import std.algorithm.iteration : reduce;
-    import std.range : lockstep;
     import std.math : approxEqual;
-    import std.traits : isFloatingPoint;
+    import std.traits : isFloatingPoint, isNumeric;
+
+    static assert(isNumeric!OutputType, "Invalid output type - has to be numeric.");
 
-    if (prealloc.shape[] != slice.shape[])
+    if (prealloc.shape != input.shape)
     {
-        prealloc = uninitializedArray!(OutputType[])(slice.elementsCount).sliced(slice.shape);
+        prealloc = uninitializedSlice!OutputType(input.shape);
     }
 
+    assert(input.structure.strides == prealloc.structure.strides,
+            "Input slice structure does not match with resulting buffer.");
+
     static if (isFloatingPoint!OutputType)
-    {
         OutputType upvalue = 1.0;
-    }
     else
-    {
         OutputType upvalue = OutputType.max;
-    }
-
-    pure nothrow @safe @nogc InputType cmp_l(ref InputType v)
-    {
-        return v <= lowThresh ? 0 : upvalue;
-    }
-
-    pure nothrow @safe @nogc InputType cmp_lu(ref InputType v)
-    {
-        return v >= lowThresh && v <= highThresh ? upvalue : 0;
-    }
 
-    InputType delegate(ref InputType v) pure nothrow @nogc @safe cmp;
+    auto p = assumeSameStructure!("result", "input")(prealloc, input);
 
     if (lowThresh.approxEqual(highThresh))
     {
-        cmp = &cmp_l;
+        p.ndEach!((v)
+        {
+            v.result = cast(OutputType)(v.input <= lowThresh ? 0 : upvalue);
+        }, Yes.vectorized, Yes.fastmath);
     }
     else
     {
-        cmp = &cmp_lu;
-    }
-
-    foreach (ref t, e; lockstep(prealloc.byElement, slice.byElement))
-    {
-        static if (is(InputType == OutputType))
-            t = cmp(e); //(e >= lowThresh && e <= highThresh) ? e : cast(OutputType)0;
-        else
-            t = cast(OutputType)cmp(e);
+        p.ndEach!((v)
+        {
+            v.result = cast(OutputType)(v.input >= lowThresh && v.input <= highThresh ? upvalue : 0);
+        }, Yes.vectorized, Yes.fastmath);
     }
 
     return prealloc;
@@ -106,8 +97,13 @@ Params:
     slice = Input slice.
     thresh = Threshold value - any value lower than this will be set to 0, and higher to 1.
     prealloc = Optional pre-allocated slice buffer for output.
+
+Note:
+    Input and pre-allocated buffer slice, should be of same structure
+    (i.e. have same strides). If prealloc buffer is not given, and is
+    allocated anew, input slice memory must be contiguous.
 */
-Slice!(N, OutputType*) threshold(OutputType, InputType, size_t N)(Slice!(N, InputType*) slice,
+nothrow Slice!(N, OutputType*) threshold(OutputType, InputType, size_t N)(Slice!(N, InputType*) slice,
         InputType thresh, Slice!(N, OutputType*) prealloc = emptySlice!(N, OutputType))
 {
     return threshold!(OutputType, InputType, N)(slice, thresh, thresh, prealloc);
diff --git a/source/dcv/multiview/stereo/matching.d b/source/dcv/multiview/stereo/matching.d
index e5b3e24..1052aac 100644
--- a/source/dcv/multiview/stereo/matching.d
+++ b/source/dcv/multiview/stereo/matching.d
@@ -36,10 +36,11 @@ import std.functional;
 import std.math;
 
 import mir.ndslice;
+import mir.ndslice.algorithm : Yes;
 
 import dcv.core;
 import dcv.core.image;
-import dcv.core.utils : emptySlice;
+import dcv.core.utils : emptySlice, clip;
 import dcv.imgproc;
 
 alias DisparityType = uint;
diff --git a/tests/performance-tests/dub.json b/tests/performance-tests/dub.json
index 4bc6c8b..1becb4e 100644
--- a/tests/performance-tests/dub.json
+++ b/tests/performance-tests/dub.json
@@ -3,6 +3,7 @@
     "description": "Performance test for DCV.",
     "targetType": "executable",
     "sourcePaths": ["source"],
+    "dflags-ldc": ["-mcpu=native"],
     "dependencies": 
     {
         "dcv:core": {"path": "../../"},
diff --git a/tests/performance-tests/source/performance/measure.d b/tests/performance-tests/source/performance/measure.d
index b6a32cc..526c100 100644
--- a/tests/performance-tests/source/performance/measure.d
+++ b/tests/performance-tests/source/performance/measure.d
@@ -93,28 +93,28 @@ auto run_dcv_features_corner_harris_harrisCorners_3()
 {
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&harrisCorners!(float, float), image, 3, 0.64, 0.84, result);
+    return evalBenchmark(&harrisCorners!(float, float), image, 3, 0.64, 0.84, result, taskPool);
 }
 
 auto run_dcv_features_corner_harris_harrisCorners_5()
 {
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&harrisCorners!(float, float), image, 5, 0.64, 0.84, result);
+    return evalBenchmark(&harrisCorners!(float, float), image, 5, 0.64, 0.84, result, taskPool);
 }
 
 auto run_dcv_features_corner_harris_shiTomasiCorners_3()
 {
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&shiTomasiCorners!(float, float), image, 3, 0.84, result);
+    return evalBenchmark(&shiTomasiCorners!(float, float), image, 3, 0.84, result, taskPool);
 }
 
 auto run_dcv_features_corner_harris_shiTomasiCorners_5()
 {
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&shiTomasiCorners!(float, float), image, 5, 0.84, result);
+    return evalBenchmark(&shiTomasiCorners!(float, float), image, 5, 0.84, result, taskPool);
 }
 
 auto run_dcv_features_corner_fast_FASTDetector()
@@ -188,8 +188,10 @@ auto run_dcv_imgproc_color_rgb2hsv()
 
 auto run_dcv_imgproc_color_hsv2rgb()
 {
+    import std.random;
     auto rgb = slice!ubyte(imsize, imsize, 3);
     auto hsv = slice!float(imsize, imsize, 3);
+    hsv.ndEach!(v => v = cast(float)uniform01);
     return evalBenchmark(&hsv2rgb!(ubyte, float), hsv, rgb);
 }
 
@@ -212,7 +214,7 @@ auto run_dcv_imgproc_convolution_conv_1D_3()
     auto vector = slice!float(imsize * imsize);
     auto result = slice!float(imsize * imsize);
     auto kernel = slice!float(3);
-    return evalBenchmark(&conv!(neumann, float, float, float, 1, 1), vector, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(vector), typeof(kernel), typeof(kernel)), vector, kernel, result,
             emptySlice!(1, float), taskPool);
 }
 
@@ -221,7 +223,7 @@ auto run_dcv_imgproc_convolution_conv_1D_5()
     auto vector = slice!float(imsize * imsize);
     auto result = slice!float(imsize * imsize);
     auto kernel = slice!float(5);
-    return evalBenchmark(&conv!(neumann, float, float, float, 1, 1), vector, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(vector), typeof(kernel), typeof(kernel)), vector, kernel, result,
             emptySlice!(1, float), taskPool);
 }
 
@@ -230,7 +232,7 @@ auto run_dcv_imgproc_convolution_conv_1D_7()
     auto vector = slice!float(imsize * imsize);
     auto result = slice!float(imsize * imsize);
     auto kernel = slice!float(7);
-    return evalBenchmark(&conv!(neumann, float, float, float, 1, 1), vector, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(vector), typeof(kernel), typeof(kernel)), vector, kernel, result,
             emptySlice!(1, float), taskPool);
 }
 
@@ -239,7 +241,7 @@ auto run_dcv_imgproc_convolution_conv_2D_3x3()
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
     auto kernel = slice!float(3, 3);
-    return evalBenchmark(&conv!(neumann, float, float, float, 2, 2), image, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(image), typeof(kernel), typeof(kernel)), image, kernel, result,
             emptySlice!(2, float), taskPool);
 }
 
@@ -248,7 +250,7 @@ auto run_dcv_imgproc_convolution_conv_2D_5x5()
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
     auto kernel = slice!float(5, 5);
-    return evalBenchmark(&conv!(neumann, float, float, float, 2, 2), image, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(image), typeof(kernel), typeof(kernel)), image, kernel, result,
             emptySlice!(2, float), taskPool);
 }
 
@@ -257,7 +259,7 @@ auto run_dcv_imgproc_convolution_conv_2D_7x7()
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
     auto kernel = slice!float(7, 7);
-    return evalBenchmark(&conv!(neumann, float, float, float, 2, 2), image, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(image), typeof(kernel), typeof(kernel)), image, kernel, result,
             emptySlice!(2, float), taskPool);
 }
 
@@ -266,7 +268,7 @@ auto run_dcv_imgproc_convolution_conv_3D_3x3()
     auto image = slice!float(imsize, imsize, 3);
     auto result = slice!float(imsize, imsize, 3);
     auto kernel = slice!float(3, 3);
-    return evalBenchmark(&conv!(neumann, float, float, float, 3, 2), image, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(image), typeof(kernel), typeof(kernel)), image, kernel, result,
             emptySlice!(2, float), taskPool);
 }
 
@@ -275,14 +277,14 @@ auto run_dcv_imgproc_convolution_conv_3D_5x5()
     auto image = slice!float(imsize, imsize, 3);
     auto result = slice!float(imsize, imsize, 3);
     auto kernel = slice!float(5, 5);
-    return evalBenchmark(&conv!(neumann, float, float, float, 3, 2), image, kernel, result,
+    return evalBenchmark(&conv!(neumann, typeof(image), typeof(kernel), typeof(kernel)), image, kernel, result,
             emptySlice!(2, float), taskPool);
 }
 
 auto run_dcv_imgproc_filter_filterNonMaximum()
 {
     auto image = slice!float(imsize, imsize);
-    return evalBenchmark(&filterNonMaximum!float, image, 10);
+    return evalBenchmark(&filterNonMaximum!(typeof(image)), image, 10);
 }
 
 auto run_dcv_imgproc_filter_calcPartialDerivatives()
@@ -290,7 +292,7 @@ auto run_dcv_imgproc_filter_calcPartialDerivatives()
     auto image = slice!float(imsize, imsize);
     auto fx = slice!float(imsize, imsize);
     auto fy = slice!float(imsize, imsize);
-    return evalBenchmark(&calcPartialDerivatives!float, image, fx, fy);
+    return evalBenchmark(&calcPartialDerivatives!(typeof(image), float), image, fx, fy, taskPool);
 }
 
 auto run_dcv_imgproc_filter_calcGradients()
@@ -298,7 +300,7 @@ auto run_dcv_imgproc_filter_calcGradients()
     auto image = slice!float(imsize, imsize);
     auto mag = slice!float(imsize, imsize);
     auto orient = slice!float(imsize, imsize);
-    return evalBenchmark(&calcGradients!(float), image, mag, orient, EdgeKernel.SIMPLE);
+    return evalBenchmark(&calcGradients!(typeof(image), float), image, mag, orient, EdgeKernel.SIMPLE, taskPool);
 }
 
 auto run_dcv_imgproc_filter_nonMaximaSupression()
@@ -306,7 +308,7 @@ auto run_dcv_imgproc_filter_nonMaximaSupression()
     auto mag = slice!float(imsize, imsize);
     auto orient = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&nonMaximaSupression!(float, float), mag, orient, result);
+    return evalBenchmark(&nonMaximaSupression!(typeof(mag), float), mag, orient, result, taskPool);
 }
 
 auto run_dcv_imgproc_filter_canny()
@@ -316,7 +318,7 @@ auto run_dcv_imgproc_filter_canny()
     auto result = slice!ubyte(imsize, imsize);
     auto runCanny(typeof(image) image, typeof(result) result)
     {
-        canny!ubyte(image, 0, 1, EdgeKernel.SOBEL, result);
+        canny!ubyte(image, 0, 1, EdgeKernel.SOBEL, result, taskPool);
     }
     //return evalBenchmark(&canny!(float,ubyte), image, cast(ubyte)0, cast(ubyte)1, EdgeKernel.SOBEL, result);
     return evalBenchmark(&runCanny, image, result);
@@ -326,14 +328,14 @@ auto run_dcv_imgproc_filter_bilateralFilter_3()
 {
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&bilateralFilter!(neumann, float, float, 2), image, 0.84, 3, result, taskPool);
+    return evalBenchmark(&bilateralFilter!(neumann, typeof(image), float, 2), image, 0.84, 3, result, taskPool);
 }
 
 auto run_dcv_imgproc_filter_bilateralFilter_5()
 {
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
-    return evalBenchmark(&bilateralFilter!(neumann, float, float, 2), image, 0.84, 5, result, taskPool);
+    return evalBenchmark(&bilateralFilter!(neumann, typeof(image), float, 2), image, 0.84, 5, result, taskPool);
 }
 
 auto run_dcv_imgproc_filter_medianFilter_3()
@@ -435,7 +437,7 @@ auto run_dcv_imgproc_imgmanip_warp()
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
     auto warpMap = slice!float(imsize, imsize, 2);
-    return evalBenchmark(&warp!(linear, 2, float, float), image, warpMap, result);
+    return evalBenchmark(&warp!(linear, typeof(image), typeof(warpMap)), image, warpMap, result);
 }
 
 auto run_dcv_imgproc_imgmanip_remap()
@@ -443,7 +445,7 @@ auto run_dcv_imgproc_imgmanip_remap()
     auto image = slice!float(imsize, imsize);
     auto result = slice!float(imsize, imsize);
     auto remapMap = slice!float(imsize, imsize, 2);
-    return evalBenchmark(&remap!(linear, 2, float, float), image, remapMap, result);
+    return evalBenchmark(&remap!(linear, typeof(image), typeof(remapMap)), image, remapMap, result);
 }
 
 auto run_dcv_imgproc_threshold_threshold()