From 47cc5dcff519d0be1e206bfdc52121a44d345e98 Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Sun, 10 Oct 2021 22:50:19 -0400
Subject: [PATCH] v3.18.1

---
 INSTALL_LINUX                  |   22 +-
 RELEASE_NOTES                  |   16 +-
 algo/scrypt/scrypt-core-4way.c | 2886 ++++++++++++--------------------
 algo/scrypt/scrypt.c           |  456 +++--
 algo/sha/sha-hash-4way.h       |    3 -
 algo/sha/sha256-hash-4way.c    |  881 ++--------
 algo/shabal/shabal-hash-4way.c |   13 +-
 algo/swifftx/inttypes.h        |    8 +-
 algo/swifftx/swifftx.c         |  409 ++++-
 configure                      |   20 +-
 configure.ac                   |    2 +-
 cpu-miner.c                    |   28 +-
 simd-utils/simd-256.h          |   48 +-
 simd-utils/simd-512.h          |    4 -
 14 files changed, 2013 insertions(+), 2783 deletions(-)

diff --git a/INSTALL_LINUX b/INSTALL_LINUX
index a88f888c..24927b46 100644
--- a/INSTALL_LINUX
+++ b/INSTALL_LINUX
@@ -32,14 +32,26 @@ but different package names.

 $ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
-support depending on your CPU and compiler version:
+openssl 1.1.0e or higher.

-"-march=native" is always the best choice
+znver1 and znver2 should be recognized by most recent versions of GCC and
+znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
+In the meantime, here are some suggestions for compiling on newer CPUs:

-"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+"-march=native" is usually the best choice and is used by build.sh.

-"-msha" Add SHA to other tuning options
+"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recognized.
+
+"-mcascadelake -msha" or
+"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
+
+Features can also be added individually:
+
+"-msha" adds support for HW-accelerated sha256.
+
+"-mavx512" adds support for 512 bit vectors.
+
+"-mvaes" adds support for parallel AES.

 Additional instructions for static compilation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 056491f7..ef3f912f 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,10 +65,24 @@ If not what makes it happen or not happen?

 Change Log
 ----------

+v3.18.1
+
+More speed for scrypt:
+ - additional scryptn2 optimizations for all CPU architectures,
+ - AVX2 is now used by default on CPUs with SHA but not AVX512,
+ - scrypt:1024 performance lost in v3.18.0 is restored,
+ - AVX512 & AVX2 improvements to scrypt:1024.
+
+Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
+
+Issue #337: fixed a problem that could display negative stats values in the
+first summary report if the report was forced prematurely due to a stratum
+diff change. The stats are still invalid but should now display zeros.
+
 v3.18.0

 Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
-   - AVX512 & SHA support for SHA256, AVX512 has priority,
+   - AVX512 & SHA support for sha256, AVX512 has priority,
    - up to 50% increase in hashrate,
    - memory requirements reduced 30-60% depending on CPU architecture,
    - memory usage displayed at startup,
diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c
index 19ff9cdd..1039c3fc 100644
--- a/algo/scrypt/scrypt-core-4way.c
+++ b/algo/scrypt/scrypt-core-4way.c
@@ -116,23 +116,6 @@ do{ \
    c1 = XOR( c1, tc ); \
 } while (0);

-// use 16 regs AVX, AVX2, 8 buf for AVX512?
-#define ARX_4BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3, n ) \ -do{ \ - TYPE ta = ADD32( a2, a3 ); \ - TYPE tb = ADD32( b2, b3 ); \ - TYPE tc = ADD32( c2, c3 ); \ - TYPE td = ADD32( d2, d3 ); \ - ta = ROL32( ta, n ); \ - tb = ROL32( tb, n ); \ - tc = ROL32( tc, n ); \ - td = ROL32( td, n ); \ - a1 = XOR( a1, ta ); \ - b1 = XOR( b1, tb ); \ - c1 = XOR( c1, tc ); \ - d1 = XOR( d1, td ); \ -} while (0); - // Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & // ROR_1X32 defined. @@ -208,95 +191,127 @@ do{ \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); -#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ - ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ - ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ - ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ - ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ +// For use when fast bit rotate is not available. +// contains target specif instructions, only use with 128 bit vectrors. +#define SALSA_2ROUNDS_SIMD128_2BUF_SLOROT \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XA3 = ROR_1X32( XA3 ); \ - XB3 = ROR_1X32( XB3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ - ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ - ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ - ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); - -// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, -// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3 ) -#define SALSA_2ROUNDS_SIMD128_4BUF \ - ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ - XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ - ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ - XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ - ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ - XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ - ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ - XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ - XA1 = ROL_1X32( XA1 ); \ - XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ - XD1 = ROL_1X32( XD1 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ - XC3 = ROR_1X32( XC3 ); \ - XD3 = ROR_1X32( XD3 ); \ - XA2 = SWAP_64( XA2 ); \ - XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( 
XC2 ); \ - XD2 = SWAP_64( XD2 ); \ - ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ - XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ - ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ - XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ - ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ - XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ - ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ - XC0, XC1, XC2, XD0, XD1, XD2, 18 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ - XC3 = ROL_1X32( XC3 ); \ - XD3 = ROL_1X32( XD3 ); \ - XA1 = ROR_1X32( XA1 ); \ - XB1 = ROR_1X32( XB1 ); \ - XC1 = ROR_1X32( XC1 ); \ - XD1 = ROR_1X32( XD1 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 ); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); - -#define SALSA_2ROUNDS_FINAL_SIMD128_4BUF \ - ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ - XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ - ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ - XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ - ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ - XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ - ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ - XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + XB1 = ROR_1X32( XB1 ); \ +} while (0); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ - XD1 = ROL_1X32( XD1 ); \ XA3 = ROR_1X32( XA3 ); \ XB3 = ROR_1X32( XB3 ); \ - XC3 = ROR_1X32( XC3 ); \ - XD3 = ROR_1X32( XD3 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); \ - ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ - XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ - ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ - XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ - ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ - XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ - ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ - XC0, XC1, XC2, XD0, XD1, XD2, 18 ); + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + // Inlined ARX #define SALSA_2ROUNDS_SIMD128_3BUF \ @@ -402,7 +417,8 @@ do{ \ // slow rol, an attempt to optimze non-avx512 bit rotations -#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROL \ +// Contains target specific instructions, only for use with 128 bit vectors +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ @@ -410,14 +426,14 @@ do{ 
\ TYPE T = _mm_slli_epi32( TA, 7 ); \ TA = _mm_srli_epi32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ - T = _mm_slli_epi32( TB, 7 );\ XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ TB = _mm_srli_epi32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ - T = _mm_slli_epi32( TC, 7 );\ XB1 = XOR( XB1, TB ); \ - XC1 = XOR( XC1, T ); \ + T = _mm_slli_epi32( TC, 7 );\ TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA0 ); \ @@ -426,14 +442,14 @@ do{ \ T = _mm_slli_epi32( TA, 9 ); \ TA = _mm_srli_epi32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ T = _mm_slli_epi32( TB, 9 );\ TB = _mm_srli_epi32( TB, 23 );\ - XA2 = XOR( XA2, TA ); \ XB2 = XOR( XB2, T ); \ - T = _mm_slli_epi32( TC, 9 );\ XB2 = XOR( XB2, TB ); \ - XC2 = XOR( XC2, T ); \ + T = _mm_slli_epi32( TC, 9 );\ TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA1 ); \ @@ -442,17 +458,17 @@ do{ \ T = _mm_slli_epi32( TA, 13); \ TA = _mm_srli_epi32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ - XA3 = XOR( XA3, T ); \ XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ T = _mm_slli_epi32( TB, 13); \ TB = _mm_srli_epi32( TB, 19 ); \ - XA3 = XOR( XA3, TA ); \ XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ T = _mm_slli_epi32( TC, 13); \ TC = _mm_srli_epi32( TC, 19 ); \ - XB3 = XOR( XB3, TB ); \ XC3 = XOR( XC3, T ); \ - XC1 = ROL_1X32( XC1 ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA2 ); \ @@ -461,70 +477,94 @@ do{ \ T = _mm_slli_epi32( TA, 18 ); \ TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ T = _mm_slli_epi32( TB, 18 ); \ - XB2 = SWAP_64( XB2 ); \ TB = _mm_srli_epi32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ T = _mm_slli_epi32( TC, 18 ); \ - XA0 = XOR( XA0, TA ); \ TC = _mm_srli_epi32( TC, 14 ); \ XC0 = XOR( XC0, T ); \ - XB0 = XOR( XB0, TB ); \ - XC2 = SWAP_64( XC2 ); \ XC0 = XOR( XC0, TC ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( XB0, XB1 ); \ TC = ADD32( XC0, XC1 ); \ - TA = ROL32( TA, 7 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - TB = ROL32( TB, 7 ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - TC = ROL32( TC, 7 ); \ + T = _mm_slli_epi32( TC, 7 ); \ + TC = _mm_srli_epi32( TC, 25 ); \ XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ TC = ADD32( XC3, XC0 ); \ - TA = ROL32( TA, 9 ); \ - TB = ROL32( TB, 9 ); \ - TC = ROL32( TC, 9 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ + T = _mm_slli_epi32( TC, 9 ); \ + TC = _mm_srli_epi32( TC, 23 ); \ + XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ - TA = ROL32( TA, 13 ); \ TC = ADD32( XC2, XC3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ - TB = ROL32( TB, 13 ); \ XB3 = ROL_1X32( XB3 ); \ - XA1 = XOR( XA1, TA ); \ - TC = ROL32( TC, 13 ); \ XC3 = ROL_1X32( XC3 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 
); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ + T = _mm_slli_epi32( TC, 13 ); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ - TA = ROL32( TA, 18); \ TC = ADD32( XC1, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ - TB = ROL32( TB, 18); \ - XA0 = XOR( XA0, TA ); \ XB2 = SWAP_64( XB2 ); \ - TC = ROL32( TC, 18); \ - XB0 = XOR( XB0, TB ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ XC2 = SWAP_64( XC2 ); \ XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + T = _mm_slli_epi32( TC, 18 ); \ + TC = _mm_srli_epi32( TC, 14 ); \ XB1 = ROR_1X32( XB1 ); \ - XC0 = XOR( XC0, TC ); \ XC1 = ROR_1X32( XC1 ); \ + XC0 = XOR( XC0, T ); \ + XC0 = XOR( XC0, TC ); \ } while (0); @@ -614,6 +654,12 @@ do{ \ SALSA_2ROUNDS_SIMD128_2BUF; \ SALSA_2ROUNDS_SIMD128_2BUF; +#define SALSA_8ROUNDS_SIMD128_2BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; + #define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ SALSA_2ROUNDS_SIMD128_2BUF; \ SALSA_2ROUNDS_SIMD128_2BUF; \ @@ -626,6 +672,12 @@ do{ \ SALSA_2ROUNDS_SIMD128_3BUF; \ SALSA_2ROUNDS_SIMD128_3BUF; +#define SALSA_8ROUNDS_SIMD128_3BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; + #define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ SALSA_2ROUNDS_SIMD128_3BUF; \ SALSA_2ROUNDS_SIMD128_3BUF; \ @@ -746,13 +798,13 @@ static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*16 ); + memcpy( &V[n * 32], X, 128*16 ); xor_salsa8_16way( &X[ 0], &X[16] ); xor_salsa8_16way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m512_ovly *vptr[16]; // pointer to V offset for each lane m512_ovly *x16 = (m512_ovly*)(&X[16]); @@ -765,12 +817,12 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m512_ovly v; // V value assembled from different indexes for ( int l = 0; l < 8; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm512_xor_si512( X[ k ], v.m512 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm512_xor_si512( X[i], v.m512 ); } xor_salsa8_16way( &X[ 0], &X[16] ); @@ -852,14 +904,14 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 4*128 ); + memcpy( &V[n * 32], X, 4*128 ); salsa8_simd128_4way( &X[ 0], &X[16] ); salsa8_simd128_4way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { uint32_t x16[4]; // index into V for each lane memcpy( x16, &X[16], 16 ); @@ -869,12 +921,12 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) x16[3] = 32 * ( x16[3] & ( N-1) ); m128_ovly *v = (m128_ovly*)V; - for( int k = 0; k < 32; k++ ) + for( int i = 0; i < 32; i++ ) { - X[k] = 
_mm_xor_si128( X[k], _mm_set_epi32( v[ x16[3] + k ].u32[3], - v[ x16[2] + k ].u32[2], - v[ x16[1] + k ].u32[1], - v[ x16[0] + k ].u32[0] ) ); + X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3], + v[ x16[2] + i ].u32[2], + v[ x16[1] + i ].u32[1], + v[ x16[0] + i ].u32[0] ) ); } salsa8_simd128_4way( &X[ 0], &X[16] ); @@ -882,49 +934,60 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) } } -// not working, occasional accepted shares, not up to date. +// 4x memory usage +// Working // 4x128 interleaving -static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +static void salsa_shuffle_4way_simd128( __m512i *X ) { - __m512i X0, X1, X2, X3; - uint32_t *b = (uint32_t*)B; - m512_ovly y[4], z[4]; - - // mix C into B then shuffle B into X - B[0] = _mm512_xor_si512( B[0], C[0] ); - B[1] = _mm512_xor_si512( B[1], C[1] ); - B[2] = _mm512_xor_si512( B[2], C[2] ); - B[3] = _mm512_xor_si512( B[3], C[3] ); + __m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; - // { l3u15, l3u10, l3u5, l3u0, l2u15, l2u10, l2u5, l2u0, - // l1u15, l1u10, l1u5, l1u0, l0u15, l0u10, l0u5, l0u0 } + Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] ); + Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] ); - // b index = row index + lane index + unit index - // = ( 8 * (u/4) ) + ( 4*l ) + ( u%4 ) + Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] ); + Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] ); - X0 = _mm512_set_epi32( b[63], b[46], b[29], b[12], // lane 3[3:0] - b[59], b[42], b[25], b[ 8], // lane 2[3:0] - b[55], b[38], b[21], b[ 4], // lane 1[3:0] - b[51], b[34], b[17], b[ 0] ); // lane 0[3:0] + Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] ); + Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] ); - X1 = _mm512_set_epi32( b[15], b[62], b[45], b[28], - b[11], b[58], b[41], b[24], - b[ 7], b[54], b[37], b[20], - b[ 3], b[50], b[33], b[16] ); // lane 0[7:4] + Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] ); + Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] ); - X2 = _mm512_set_epi32( b[31], b[14], b[61], b[44], - b[27], b[10], b[57], b[40], - b[23], b[ 6], b[53], b[36], - b[19], b[ 2], b[49], b[32] ); + X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 ); + X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 ); + X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 ); + X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 ); +} - X3 = _mm512_set_epi32( b[47], b[30], b[13], b[60], - b[43], b[26], b[ 9], b[56], - b[39], b[22], b[ 5], b[52], - b[35], b[18], b[ 1], b[48] ); +static void salsa_unshuffle_4way_simd128( __m512i *X ) +{ + __m512i Y0, Y1, Y2, Y3; + + Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] ); + Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] ); + Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] ); + Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] ); + + Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] ); + Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] ); + Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] ); + Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] ); + + X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] ); + X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] ); + X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] ); + X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] ); +} +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + X0 = B[0] = _mm512_xor_si512( B[0], C[0] ); + X1 = B[1] = _mm512_xor_si512( B[1], C[1] ); + X2 = B[2] = _mm512_xor_si512( B[2], C[2] ); + X3 = B[3] = _mm512_xor_si512( B[3], C[3] ); - // define 
targets for macros used in round function template #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes #define ROR_1X32 mm512_shuflr128_32 #define SWAP_64 mm512_swap128_64 @@ -932,7 +995,7 @@ static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) #define ADD32 _mm512_add_epi32 #define XOR _mm512_xor_si512 - SALSA_8ROUNDS_FINAL_SIMD128; + SALSA_8ROUNDS_SIMD128; #undef ROL_1X32 #undef ROR_1X32 @@ -941,123 +1004,25 @@ static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) #undef ADD32 #undef XOR - y[0].m512 = X0; - y[1].m512 = X1; - y[2].m512 = X2; - y[3].m512 = X3; - - // lane 0 - z[0].u32[ 0 ] = y[0].u32[ 0]; - z[0].u32[ 3 ] = y[1].u32[ 0]; - z[0].u32[ 2 ] = y[2].u32[ 0]; - z[0].u32[ 1 ] = y[3].u32[ 0]; - - // lane 1 - z[0].u32[ 0+ 4 ] = y[0].u32[ 4]; - z[0].u32[ 3+ 4 ] = y[1].u32[ 4]; - z[0].u32[ 2+ 4 ] = y[2].u32[ 4]; - z[0].u32[ 1+ 4 ] = y[3].u32[ 4]; - - // lane 2 - z[0].u32[ 0+ 8 ] = y[0].u32[ 8]; - z[0].u32[ 3+ 8 ] = y[1].u32[ 8]; - z[0].u32[ 2+ 8 ] = y[2].u32[ 8]; - z[0].u32[ 1+ 8 ] = y[3].u32[ 8]; - - // lane 3 - z[0].u32[ 0+12 ] = y[0].u32[12]; - z[0].u32[ 3+12 ] = y[1].u32[12]; - z[0].u32[ 2+12 ] = y[2].u32[12]; - z[0].u32[ 1+12 ] = y[3].u32[12]; - - // lane 0 - z[1].u32[ 1 ] = y[0].u32[ 1]; - z[1].u32[ 0 ] = y[1].u32[ 1]; - z[1].u32[ 3 ] = y[2].u32[ 1]; - z[1].u32[ 2 ] = y[3].u32[ 1]; - - //lane 1 - z[1].u32[ 1+ 4 ] = y[0].u32[ 5]; - z[1].u32[ 0+ 4 ] = y[1].u32[ 5]; - z[1].u32[ 3+ 4 ] = y[2].u32[ 5]; - z[1].u32[ 2+ 4 ] = y[3].u32[ 5]; - - // lane 2 - z[1].u32[ 1+ 8 ] = y[0].u32[ 9]; - z[1].u32[ 0+ 8 ] = y[1].u32[ 9]; - z[1].u32[ 3+ 8 ] = y[2].u32[ 9]; - z[1].u32[ 2+ 8 ] = y[3].u32[ 9]; - - // lane 3 - z[1].u32[ 1+12 ] = y[0].u32[13]; - z[1].u32[ 0+12 ] = y[1].u32[13]; - z[1].u32[ 3+12 ] = y[2].u32[13]; - z[1].u32[ 2+12 ] = y[3].u32[13]; - - // lane 0 - z[2].u32[ 2 ] = y[0].u32[2]; - z[2].u32[ 1 ] = y[1].u32[2]; - z[2].u32[ 0 ] = y[2].u32[2]; - z[2].u32[ 3 ] = y[3].u32[2]; - - // lane 1 - z[2].u32[ 2+ 4 ] = y[0].u32[6]; - z[2].u32[ 1+ 4 ] = y[1].u32[6]; - z[2].u32[ 0+ 4 ] = y[2].u32[6]; - z[2].u32[ 3+ 4 ] = y[3].u32[6]; - - // lane 2 - z[2].u32[ 2+ 8 ] = y[0].u32[10]; - z[2].u32[ 1+ 8 ] = y[1].u32[10]; - z[2].u32[ 0+ 8 ] = y[2].u32[10]; - z[2].u32[ 3+ 8 ] = y[3].u32[10]; - - // lane 3 - z[2].u32[ 2+12 ] = y[0].u32[14]; - z[2].u32[ 1+12 ] = y[1].u32[14]; - z[2].u32[ 0+12 ] = y[2].u32[14]; - z[2].u32[ 3+12 ] = y[3].u32[14]; - - // lane 0 - z[3].u32[ 3 ] = y[0].u32[ 3]; - z[3].u32[ 2 ] = y[1].u32[ 3]; - z[3].u32[ 1 ] = y[2].u32[ 3]; - z[3].u32[ 0 ] = y[3].u32[ 3]; - - // lane 1 - z[3].u32[ 3+ 4 ] = y[0].u32[ 7]; - z[3].u32[ 2+ 4 ] = y[1].u32[ 7]; - z[3].u32[ 1+ 4 ] = y[2].u32[ 7]; - z[3].u32[ 0+ 4 ] = y[3].u32[ 7]; - - // lane 2 - z[3].u32[ 3+ 8 ] = y[0].u32[11]; - z[3].u32[ 2+ 8 ] = y[1].u32[11]; - z[3].u32[ 1+ 8 ] = y[2].u32[11]; - z[3].u32[ 0+ 8 ] = y[3].u32[11]; - - // lane 1 - z[3].u32[ 3+12 ] = y[0].u32[15]; - z[3].u32[ 2+12 ] = y[1].u32[15]; - z[3].u32[ 1+12 ] = y[2].u32[15]; - z[3].u32[ 0+12 ] = y[3].u32[15]; - - B[0] = _mm512_add_epi32( B[0], z[0].m512 ); - B[1] = _mm512_add_epi32( B[1], z[1].m512 ); - B[2] = _mm512_add_epi32( B[2], z[2].m512 ); - B[3] = _mm512_add_epi32( B[3], z[3].m512 ); + B[0] = _mm512_add_epi32( B[0], X0 ); + B[1] = _mm512_add_epi32( B[1], X1 ); + B[2] = _mm512_add_epi32( B[2], X2 ); + B[3] = _mm512_add_epi32( B[3], X3 ); } void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + salsa_shuffle_4way_simd128( X ); + salsa_shuffle_4way_simd128( 
X+4 ); + + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 8], X, 128*4 ); + memcpy( &V[n * 8], X, 128*4 ); salsa8_4way_simd128( &X[0], &X[4] ); salsa8_4way_simd128( &X[4], &X[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m512_ovly x16; x16 = ( (m512_ovly*)X )[4]; @@ -1066,25 +1031,22 @@ void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) - X[k] = _mm512_xor_si512( X[k], m512_const_128( - ( (m512_ovly*)V )[ j3+k ].m128[3], - ( (m512_ovly*)V )[ j2+k ].m128[2], - ( (m512_ovly*)V )[ j1+k ].m128[1], - ( (m512_ovly*)V )[ j0+k ].m128[0] ) ); + for ( int i = 0; i < 8; i++ ) + { + __m512i v10 = _mm512_mask_blend_epi32( 0x000f, V[ j1+i ], V[ j0+i ] ); + __m512i v32 = _mm512_mask_blend_epi32( 0x0f00, V[ j3+i ], V[ j2+i ] ); + X[i] = _mm512_xor_si512( X[i], _mm512_mask_blend_epi32( 0x00ff, + v32, v10 ) ); + } -/* - for ( int k = 0; k < 8; k++ ) - X[k] = _mm512_xor_si512( X[k], m512_diagonal128_32( - V[ j3+k ], V[ j2+k ], V[ j1+k ], V[ j0+k ] ) ); -*/ salsa8_4way_simd128( &X[0], &X[4] ); salsa8_4way_simd128( &X[4], &X[0] ); } + + salsa_unshuffle_4way_simd128( X ); + salsa_unshuffle_4way_simd128( X+4 ); } - - #endif // AVX512 #if defined(__AVX2__) @@ -1142,14 +1104,14 @@ static void salsa8_8way( __m256i * const B, const __m256i * const C ) void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*8 ); + memcpy( &V[n * 32], X, 128*8 ); salsa8_8way( &X[ 0], &X[16] ); salsa8_8way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly *vptr[8]; // pointer to V offset for each lane m256_ovly *x16 = (m256_ovly*)(&X[16]); @@ -1162,12 +1124,12 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m256_ovly v; // V value assembled from different indexes for ( int l = 0; l < 8; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm256_xor_si256( X[ k ], v.m256 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm256_xor_si256( X[i], v.m256 ); } salsa8_8way( &X[ 0], &X[16] ); @@ -1176,7 +1138,7 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) } // 2x memory usage -// Working, not up to date, needs stream optimization. 
+// Working // Essentially Pooler 6way // 2x128 interleaved simd128 // ------- lane 1 ------- ------- lane 0 ------- @@ -1185,31 +1147,56 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) // { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] // { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] -static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +static void salsa_shuffle_2way_simd128( __m256i *X ) { - __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; - // mix C into B then shuffle B into X - B[0] = _mm256_xor_si256( B[0], C[0] ); - B[1] = _mm256_xor_si256( B[1], C[1] ); - B[2] = _mm256_xor_si256( B[2], C[2] ); - B[3] = _mm256_xor_si256( B[3], C[3] ); + Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 ); + Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 ); + + Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 ); + Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 ); + + Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 ); + Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 ); + + Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 ); + Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 ); - Y0 = _mm256_blend_epi32( B[1], B[0], 0x11 ); - X0 = _mm256_blend_epi32( B[3], B[2], 0x44 ); - X0 = _mm256_blend_epi32( X0, Y0, 0x33); + X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 ); + X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 ); + X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 ); + X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 ); +} - Y1 = _mm256_blend_epi32( B[2], B[1], 0x11 ); - X1 = _mm256_blend_epi32( B[0], B[3], 0x44 ); - X1 = _mm256_blend_epi32( X1, Y1, 0x33 ); +static void salsa_unshuffle_2way_simd128( __m256i *X ) +{ + __m256i Y0, Y1, Y2, Y3; + + Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 ); + Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 ); + Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 ); + Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 ); + + X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 ); + X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 ); + X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 ); + X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 ); +} - Y2 = _mm256_blend_epi32( B[3], B[2], 0x11 ); - X2 = _mm256_blend_epi32( B[1], B[0], 0x44 ); - X2 = _mm256_blend_epi32( X2, Y2, 0x33 ); +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3; - Y3 = _mm256_blend_epi32( B[0], B[3], 0x11 ); - X3 = _mm256_blend_epi32( B[2], B[1], 0x44 ); - X3 = _mm256_blend_epi32( X3, Y3, 0x33 ); + X0 = B[0] = _mm256_xor_si256( B[0], C[0] ); + X1 = B[1] = _mm256_xor_si256( B[1], C[1] ); + X2 = B[2] = _mm256_xor_si256( B[2], C[2] ); + X3 = B[3] = _mm256_xor_si256( B[3], C[3] ); // define targets for macros used in round function template #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes @@ -1228,52 +1215,41 @@ static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) #undef ADD32 #undef XOR - // init with X0 then blend in the other elements + B[0] = _mm256_add_epi32( B[0], X0 ); + B[1] = _mm256_add_epi32( B[1], X1 ); + B[2] = _mm256_add_epi32( B[2], X2 ); + B[3] = _mm256_add_epi32( B[3], X3 ); +} - Y0 = _mm256_blend_epi32( X0, X1, 0x88 ); - Y1 = _mm256_blend_epi32( X0, X1, 0x11 ); - Y2 = _mm256_blend_epi32( X0, X1, 0x22 ); - Y3 = _mm256_blend_epi32( X0, X1, 0x44 ); +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + 
salsa_shuffle_2way_simd128( X ); + salsa_shuffle_2way_simd128( X+4 ); - Y0 = _mm256_blend_epi32( Y0, X2, 0x44 ); - Y1 = _mm256_blend_epi32( Y1, X2, 0x88 ); - Y2 = _mm256_blend_epi32( Y2, X2, 0x11 ); - Y3 = _mm256_blend_epi32( Y3, X2, 0x22 ); - - Y0 = _mm256_blend_epi32( Y0, X3, 0x22 ); - Y1 = _mm256_blend_epi32( Y1, X3, 0x44 ); - Y2 = _mm256_blend_epi32( Y2, X3, 0x88 ); - Y3 = _mm256_blend_epi32( Y3, X3, 0x11 ); - - B[0] = _mm256_add_epi32( B[0], Y0 ); - B[1] = _mm256_add_epi32( B[1], Y1 ); - B[2] = _mm256_add_epi32( B[2], Y2 ); - B[3] = _mm256_add_epi32( B[3], Y3 ); -} - -void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) -{ - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 8], X, 128*2 ); + memcpy( &V[n * 8], X, 128*2 ); salsa8_2way_simd128( &X[0], &X[4] ); salsa8_2way_simd128( &X[4], &X[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly x16; x16 = ( (m256_ovly*)X )[4]; uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) - X[k] = _mm256_xor_si256( X[k], _mm256_blend_epi32( V[ j1+k ], - V[ j0+k ], 0x0f ) ); + for ( int i = 0; i < 8; i++ ) + X[i] = _mm256_xor_si256( X[i], _mm256_blend_epi32( V[ j1+i ], + V[ j0+i ], 0x0f ) ); salsa8_2way_simd128( &X[0], &X[4] ); salsa8_2way_simd128( &X[4], &X[0] ); } + + salsa_unshuffle_2way_simd128( X ); + salsa_unshuffle_2way_simd128( X+4 ); } // Working @@ -1386,17 +1362,17 @@ void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) __m256i *V0 = V; __m256i *V1 = V + 8*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( V0 + i*8 + k, X0[k] ); - _mm256_stream_si256( V1 + i*8 + k, X1[k] ); + _mm256_stream_si256( V0 + n*8 + i, X0[i] ); + _mm256_stream_si256( V1 + n*8 + i, X1[i] ); } salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; @@ -1406,25 +1382,16 @@ void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + k ); - const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + k ); - const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + k ); - const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + k ); - X0[k] = _mm256_xor_si256( X0[k], + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + i ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + i ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + i ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + i ); + X0[i] = _mm256_xor_si256( X0[i], _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], + X1[i] = _mm256_xor_si256( X1[i], _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); - - -/* - X0[k] = _mm256_xor_si256( X0[k], - _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], - _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); -*/ - } salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); @@ -1577,17 +1544,17 @@ void 
scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) __m256i *V1 = V + 8*N; __m256i *V2 = V + 16*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V0[i * 8], X0, 128*2 ); - memcpy( &V1[i * 8], X1, 128*2 ); - memcpy( &V2[i * 8], X2, 128*2 ); + memcpy( &V0[n * 8], X0, 128*2 ); + memcpy( &V1[n * 8], X1, 128*2 ); + memcpy( &V2[n * 8], X2, 128*2 ); salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], &X0[4], &X1[4], &X2[4] ); salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], &X0[0], &X1[0], &X2[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly x16a, x16b, x16c; x16a = ( (m256_ovly*)X0 )[4]; @@ -1601,14 +1568,14 @@ void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - X0[k] = _mm256_xor_si256( X0[k], - _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], - _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); - X2[k] = _mm256_xor_si256( X2[k], - _mm256_blend_epi32( V2[ j1c+k ], V2[ j0c+k ], 0x0f ) ); + X0[i] = _mm256_xor_si256( X0[i], + _mm256_blend_epi32( V0[ j1a+i ], V0[ j0a+i ], 0x0f ) ); + X1[i] = _mm256_xor_si256( X1[i], + _mm256_blend_epi32( V1[ j1b+i ], V1[ j0b+i ], 0x0f ) ); + X2[i] = _mm256_xor_si256( X2[i], + _mm256_blend_epi32( V2[ j1c+i ], V2[ j0c+i ], 0x0f ) ); } salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], @@ -1707,23 +1674,23 @@ static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) - _mm256_stream_si256( (__m256i*)V + i*8 + k, casti_m256i( X, k ) ); + for ( int i = 0; i < 8; i++ ) + _mm256_stream_si256( (__m256i*)V + n*8 + i, casti_m256i( X, i ) ); salsa8_simd128_2way( &X[ 0], &X[16] ); salsa8_simd128_2way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { // need 2 J's const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= ( ( V[ j1 + k ] & 0xffffffff00000000 ) - | ( V[ j0 + k ] & 0x00000000ffffffff ) ); + for ( int i = 0; i < 32; i++ ) + X[i] ^= ( ( V[ j1 + i ] & 0xffffffff00000000 ) + | ( V[ j0 + i ] & 0x00000000ffffffff ) ); salsa8_simd128_2way( &X[ 0], &X[16] ); salsa8_simd128_2way( &X[16], &X[ 0] ); @@ -1845,18 +1812,18 @@ void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) uint64_t *V0 = V; uint64_t *V1 = V + 32*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( (__m256i*)V0 + i*8 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*8 + k, casti_m256i( X1, k ) ); + _mm256_stream_si256( (__m256i*)V0 + n*8 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*8 + i, casti_m256i( X1, i ) ); } salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { // need 4 J's const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); @@ -1864,12 +1831,12 @@ void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) const 
uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); } salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); @@ -2025,18 +1992,18 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, uint64_t *V1 = V + 32*N; uint64_t *V2 = V + 64*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V0[i * 32], X0, 2*128 ); - memcpy( &V1[i * 32], X1, 2*128 ); - memcpy( &V2[i * 32], X2, 2*128 ); + memcpy( &V0[ n*32 ], X0, 2*128 ); + memcpy( &V1[ n*32 ], X1, 2*128 ); + memcpy( &V2[ n*32 ], X2, 2*128 ); salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], &X0[16], &X1[16], &X2[16] ); salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], &X0[ 0], &X1[ 0], &X2[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); @@ -2045,14 +2012,14 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); - X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) - | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); + X2[i] ^= ( ( V2[ j2h + i ] & 0xffffffff00000000 ) + | ( V2[ j2l + i ] & 0x00000000ffffffff ) ); } salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], &X0[16], &X1[16], &X2[16] ); @@ -2061,229 +2028,6 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, } } -// Working, deprecated -// 8x memory usage -// 2x32 interleaving -static void salsa8_simd128_2way_4buf( uint64_t *BA, uint64_t *BB, - uint64_t *BC, uint64_t *BD, const uint64_t *CA, const uint64_t *CB, - const uint64_t *CC, const uint64_t *CD ) -{ - __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; - __m256i *ba = (__m256i*)BA; - __m256i *bb = (__m256i*)BB; - __m256i *bc = (__m256i*)BC; - __m256i *bd = (__m256i*)BD; - const __m256i *ca = (const __m256i*)CA; - const __m256i *cb = (const __m256i*)CB; - const __m256i *cc = (const __m256i*)CC; - const __m256i *cd = (const __m256i*)CD; - m256_ovly ya[4], yb[4], yc[4], yd[4], - za[4], zb[4], zc[4], zd[4]; - - // mix C into B then shuffle B into X - ba[0] = _mm256_xor_si256( ba[0], ca[0] ); - bb[0] = _mm256_xor_si256( bb[0], cb[0] ); - bc[0] = _mm256_xor_si256( bc[0], cc[0] ); - bd[0] = _mm256_xor_si256( bd[0], cd[0] ); - ba[1] = _mm256_xor_si256( ba[1], ca[1] ); - bb[1] = _mm256_xor_si256( bb[1], cb[1] ); - bc[1] = 
_mm256_xor_si256( bc[1], cc[1] ); - bd[1] = _mm256_xor_si256( bd[1], cd[1] ); - ba[2] = _mm256_xor_si256( ba[2], ca[2] ); - bb[2] = _mm256_xor_si256( bb[2], cb[2] ); - bc[2] = _mm256_xor_si256( bc[2], cc[2] ); - bd[2] = _mm256_xor_si256( bd[2], cd[2] ); - ba[3] = _mm256_xor_si256( ba[3], ca[3] ); - bb[3] = _mm256_xor_si256( bb[3], cb[3] ); - bc[3] = _mm256_xor_si256( bc[3], cc[3] ); - bd[3] = _mm256_xor_si256( bd[3], cd[3] ); - - XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); - XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); - XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); - XD0 = _mm256_set_epi64x( BD[15], BD[10], BD[ 5], BD[ 0] ); - XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); - XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); - XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); - XD1 = _mm256_set_epi64x( BD[ 3], BD[14], BD[ 9], BD[ 4] ); - XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); - XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); - XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); - XD2 = _mm256_set_epi64x( BD[ 7], BD[ 2], BD[13], BD[ 8] ); - XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); - XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); - XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); - XD3 = _mm256_set_epi64x( BD[11], BD[ 6], BD[ 1], BD[12] ); - - // define targets for macros used in round function template - #define ROL_1X32 mm256_shufll_64 - #define ROR_1X32 mm256_shuflr_64 - #define SWAP_64 mm256_swap_128 - #define ROL32 mm256_rol_32 - #define ADD32 _mm256_add_epi32 - #define XOR _mm256_xor_si256 - #define TYPE __m256i - - SALSA_8ROUNDS_FINAL_SIMD128_4BUF; - - #undef ROL_1X32 - #undef ROR_1X32 - #undef SWAP_64 - #undef ROL32 - #undef ADD32 - #undef XOR - #undef TYPE - - ya[0].m256 = XA0; yb[0].m256 = XB0; - yc[0].m256 = XC0; yd[0].m256 = XD0; - ya[1].m256 = XA1; yb[1].m256 = XB1; - yc[1].m256 = XC1; yd[1].m256 = XD1; - ya[2].m256 = XA2; yb[2].m256 = XB2; - yc[2].m256 = XC2; yd[2].m256 = XD2; - ya[3].m256 = XA3; yb[3].m256 = XB3; - yc[3].m256 = XC3; yd[3].m256 = XD3; - - za[0].u64[0] = ya[0].u64[0]; - zb[0].u64[0] = yb[0].u64[0]; - zc[0].u64[0] = yc[0].u64[0]; - zd[0].u64[0] = yd[0].u64[0]; - za[0].u64[3] = ya[1].u64[0]; - zb[0].u64[3] = yb[1].u64[0]; - zc[0].u64[3] = yc[1].u64[0]; - zd[0].u64[3] = yd[1].u64[0]; - za[0].u64[2] = ya[2].u64[0]; - zb[0].u64[2] = yb[2].u64[0]; - zc[0].u64[2] = yc[2].u64[0]; - zd[0].u64[2] = yd[2].u64[0]; - za[0].u64[1] = ya[3].u64[0]; - zb[0].u64[1] = yb[3].u64[0]; - zc[0].u64[1] = yc[3].u64[0]; - zd[0].u64[1] = yd[3].u64[0]; - - za[1].u64[1] = ya[0].u64[1]; - zb[1].u64[1] = yb[0].u64[1]; - zc[1].u64[1] = yc[0].u64[1]; - zd[1].u64[1] = yd[0].u64[1]; - za[1].u64[0] = ya[1].u64[1]; - zb[1].u64[0] = yb[1].u64[1]; - zc[1].u64[0] = yc[1].u64[1]; - zd[1].u64[0] = yd[1].u64[1]; - za[1].u64[3] = ya[2].u64[1]; - zb[1].u64[3] = yb[2].u64[1]; - zc[1].u64[3] = yc[2].u64[1]; - zd[1].u64[3] = yd[2].u64[1]; - za[1].u64[2] = ya[3].u64[1]; - zb[1].u64[2] = yb[3].u64[1]; - zc[1].u64[2] = yc[3].u64[1]; - zd[1].u64[2] = yd[3].u64[1]; - - za[2].u64[2] = ya[0].u64[2]; - zb[2].u64[2] = yb[0].u64[2]; - zc[2].u64[2] = yc[0].u64[2]; - zd[2].u64[2] = yd[0].u64[2]; - za[2].u64[1] = ya[1].u64[2]; - zb[2].u64[1] = yb[1].u64[2]; - zc[2].u64[1] = yc[1].u64[2]; - zd[2].u64[1] = yd[1].u64[2]; - za[2].u64[0] = ya[2].u64[2]; - zb[2].u64[0] = yb[2].u64[2]; - zc[2].u64[0] = yc[2].u64[2]; - zd[2].u64[0] = yd[2].u64[2]; - za[2].u64[3] = ya[3].u64[2]; - 
zb[2].u64[3] = yb[3].u64[2]; - zc[2].u64[3] = yc[3].u64[2]; - zd[2].u64[3] = yd[3].u64[2]; - - za[3].u64[3] = ya[0].u64[3]; - zb[3].u64[3] = yb[0].u64[3]; - zc[3].u64[3] = yc[0].u64[3]; - zd[3].u64[3] = yd[0].u64[3]; - za[3].u64[2] = ya[1].u64[3]; - zb[3].u64[2] = yb[1].u64[3]; - zc[3].u64[2] = yc[1].u64[3]; - zd[3].u64[2] = yd[1].u64[3]; - za[3].u64[1] = ya[2].u64[3]; - zb[3].u64[1] = yb[2].u64[3]; - zc[3].u64[1] = yc[2].u64[3]; - zd[3].u64[1] = yd[2].u64[3]; - za[3].u64[0] = ya[3].u64[3]; - zb[3].u64[0] = yb[3].u64[3]; - zc[3].u64[0] = yc[3].u64[3]; - zd[3].u64[0] = yd[3].u64[3]; - - ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); - bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); - bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); - bd[0] = _mm256_add_epi32( bd[0], zd[0].m256 ); - ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); - bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); - bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); - bd[1] = _mm256_add_epi32( bd[1], zd[1].m256 ); - ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); - bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); - bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); - bd[2] = _mm256_add_epi32( bd[2], zd[2].m256 ); - ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); - bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); - bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); - bd[3] = _mm256_add_epi32( bd[3], zd[3].m256 ); -} - -void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) - -{ - uint64_t *X0 = X; - uint64_t *X1 = X+32; - uint64_t *X2 = X+64; - uint64_t *X3 = X+96; - uint64_t *V0 = V; - uint64_t *V1 = V + 32*N; - uint64_t *V2 = V + 64*N; - uint64_t *V3 = V + 96*N; - - for ( int i = 0; i < N; i++ ) - { - memcpy( &V0[i * 32], X0, 2*128 ); - memcpy( &V1[i * 32], X1, 2*128 ); - memcpy( &V2[i * 32], X2, 2*128 ); - memcpy( &V3[i * 32], X3, 2*128 ); - salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } - - for ( int i = 0; i < N; i++ ) - { - // need 4 J's - uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); - uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); - uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); - uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); - uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); - uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); - uint32_t j3l = 32 * ( (uint32_t)( X3[16] ) & ( N-1 ) ); - uint32_t j3h = 32 * ( (uint32_t)( X3[16] >> 32 ) & ( N-1 ) ); - - for ( int k = 0; k < 32; k++ ) - { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); - X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) - | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); - X3[k] ^= ( ( V3[ j3h + k ] & 0xffffffff00000000 ) - | ( V3[ j3l + k ] & 0x00000000ffffffff ) ); - } - salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } -} - #endif // AVX2 @@ -2344,13 +2088,13 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*4 ); + memcpy( &V[ n*32 ], X, 
128*4 ); xor_salsa8_4way( &X[ 0], &X[16] ); xor_salsa8_4way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m128_ovly *vptr[4]; m128_ovly *x16 = (m128_ovly*)(&X[16]); @@ -2361,12 +2105,12 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m128_ovly v; for ( int l = 0; l < 4; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm_xor_si128( X[ k ], v.m128 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm_xor_si128( X[i], v.m128 ); } xor_salsa8_4way( &X[ 0], &X[16] ); @@ -2546,19 +2290,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) - _mm_stream_si128( (__m128i*)V + i*8 + k, casti_m128i( X, k ) ); + for ( int i = 0; i < 8; i++ ) + _mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) ); salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { const int j = 32 * ( X[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= V[j + k]; + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); } @@ -2566,253 +2310,290 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) // Double buffered, 2x memory usage // No interleaving -static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, - const uint32_t * const ca, const uint32_t * const cb ) -{ - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - - // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); +static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; #if defined(__SSE4_1__) - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; +// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3; #if defined(__AVX2__) - - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], 
BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); -#else // SSE4_1 + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + +#else - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); +// SSE4.1 - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, 
YB1, 0x0f ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); #endif // AVX2 else SSE4_1 - SALSA_8ROUNDS_SIMD128_2BUF; +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XA[1] = YA1; + XB[1] = YB1; + XA[2] = YA2; + XB[2] = YB2; + XA[3] = YA3; + XB[3] = YB3; + +#endif +} + +static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) +{ + + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); -#else // SSE4_1 +#else // SSE4_1 - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = 
_mm_blend_epi16( XB0, XB1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); -#endif // AVX2 else SSE4_1 - - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); +#endif // AVX2 else SSE4_1 #else // SSE2 m128_ovly ya[4], za[4], yb[4], zb[4]; - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_2BUF; - - // Final round doesn't shuffle data back to original input order, - // process it as is. 
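/* Editorial sketch, not part of the patch: the lane order produced by
 * salsa_simd128_shuffle_2buf and inverted below.  Reading the _mm_set_epi32
 * patterns above, each 128-bit vector holds one diagonal of the 4x4 Salsa20
 * state:
 *
 *    X[0] = { x0,  x5,  x10, x15 }      X[2] = { x8,  x13, x2,  x7  }
 *    X[1] = { x4,  x9,  x14, x3  }      X[3] = { x12, x1,  x6,  x11 }
 *
 * The same permutation written out in scalar form, with hypothetical helper
 * names (assumes <stdint.h>): */

   // static const int salsa_simd128_lane_map[16] =
   //    {  0,  5, 10, 15,   4,  9, 14,  3,   8, 13,  2,  7,  12,  1,  6, 11 };
   //
   // static void salsa_shuffle_scalar_sketch( uint32_t *dst, const uint32_t *src )
   // {  // dst in shuffled (diagonal) order, src in canonical order
   //    for ( int i = 0; i < 16; i++ )  dst[i] = src[ salsa_simd128_lane_map[i] ];
   // }
   //
   // static void salsa_unshuffle_scalar_sketch( uint32_t *dst, const uint32_t *src )
   // {  // inverse: scatter each shuffled word back to its canonical position,
   //    // which is what the u32[] overlay assignments below do four at a time
   //    for ( int i = 0; i < 16; i++ )  dst[ salsa_simd128_lane_map[i] ] = src[i];
   // }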
- - ya[0].m128 = XA0; - yb[0].m128 = XB0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; za[1].u32[1] = ya[0].u32[1]; zb[1].u32[1] = yb[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; za[2].u32[2] = ya[0].u32[2]; zb[2].u32[2] = yb[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + +#endif +} + +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XA3 = BA[3] = _mm_xor_si128( 
BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_2BUF; + +#else + + SALSA_8ROUNDS_SIMD128_2BUF_SLOROT; + #endif + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + #undef ROL_1X32 #undef ROR_1X32 #undef SWAP_64 @@ -2822,570 +2603,425 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, #undef TYPE } - -// X: 2 sequential buffers -// V: 2 sequential buffers interleaved by the size of N -// interleaved buffers { v00, v01, v10, v11, v20... } -// void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) { - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; - for ( int i = 0; i < N; i++ ) + salsa_simd128_shuffle_2buf( X0, X1 ); + salsa_simd128_shuffle_2buf( X0+16, X1+16 ); + + for ( int n = 0; n < N; n++ ) { - #if defined(__AVX2__) + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) + { + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + } - for ( int k = 0; k < 4; k++ ) + #elif defined(__SSE4_1__) + + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); } #else - memcpy( &V0[ i*32 ], X0, 128 ); - memcpy( &V1[ i*32 ], X1, 128 ); + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); #endif - salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); - salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { #if defined(__AVX2__) const int j0 = 4 * ( X0[16] & ( N-1 ) ); const int j1 = 4 * ( X1[16] & ( N-1 ) ); - for ( int k = 0; k < 4; k++ ) - { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); -// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); -// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - } + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), 
v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); #else const int j0 = 8 * ( X0[16] & ( N-1 ) ); const int j1 = 8 * ( X1[16] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); } #endif -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - - for ( int k = 0; k < 16; k++ ) - { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - } -*/ - -/* - const int j0 = 32 * ( X0[16] & ( N-1 ) ); - const int j1 = 32 * ( X1[16] & ( N-1 ) ); - - for ( int k = 0; k < 32; k++ ) - { - const uint32_t v0 = V0[ j0+k ]; - const uint32_t v1 = V1[ j1+k ]; - X0[k] ^= v0; - X1[k] ^= v1; - } -*/ - - salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); - salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); } + + salsa_simd128_unshuffle_2buf( X0, X1 ); + salsa_simd128_unshuffle_2buf( X0+16, X1+16 ); } -// Triple buffered, 3x memory usage -// No interleaving -static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, - const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, + uint32_t *xc ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - __m128i *BC = (__m128i*)bc; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - const __m128i *CC = (const __m128i*)cc; - - // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i - - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BC[0] = _mm_xor_si128( BC[0], CC[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BC[1] = _mm_xor_si128( BC[1], CC[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BC[2] = _mm_xor_si128( BC[2], CC[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); - BC[3] = _mm_xor_si128( BC[3], CC[3] ); + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + __m128i YA0, YA1, YA2, 
YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; #if defined(__SSE4_1__) - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); - - XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); - XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); - - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); - XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); - - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); - XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); + ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 ); + + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); + ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 ); + + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); + ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 ); + + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 ); + +#else + +// SSE4.1 + + YA0 = _mm_blend_epi16( XA[1], 
XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); + ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 ); + + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); + ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 ); + + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); + ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 ); + + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); + ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 ); + + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); + XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f ); + + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f ); + XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f ); + + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); + XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f ); -#else // SSE4_1 +#endif // AVX2 else SSE4_1 - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); - XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); - XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); - - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); - XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); - XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); - - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); - XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); - XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); - - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); - XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); - XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); - -#endif // AVX2 else SSE3_1 +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], 
xc[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XC[0] = YC0; + XA[1] = YA1; + XB[1] = YB1; + XC[1] = YC1; + XA[2] = YA2; + XB[2] = YB2; + XC[2] = YC2; + XA[3] = YA3; + XB[3] = YB3; + XC[3] = YC3; - SALSA_8ROUNDS_SIMD128_3BUF; +#endif +} + +static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, + uint32_t* xc ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); - YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); - YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 
0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); + XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 ); #else // SSE4_1 - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); - YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); - YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 
0x30 ); + XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); + XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 ); #endif // AVX2 else SSE4_1 - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BC[0] = _mm_add_epi32( BC[0], YC0 ); - BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BC[1] = _mm_add_epi32( BC[1], YC1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BC[2] = _mm_add_epi32( BC[2], YC2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); - BC[3] = _mm_add_epi32( BC[3], YC3 ); - #else // SSE2 - m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4]; - - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_3BUF; - - // Final round doesn't shuffle data back to original input order, - // process it as is. - - ya[0].m128 = XA0; - yb[0].m128 = XB0; - yc[0].m128 = XC0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - yc[1].m128 = XC1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - yc[2].m128 = XC2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; - yc[3].m128 = XC3; + m128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4]; + + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + yc[0].m128 = XC[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + yc[1].m128 = XC[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + yc[2].m128 = XC[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; + yc[3].m128 = XC[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; zc[0].u32[0] = yc[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - zc[0].u32[3] = yc[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - zc[0].u32[2] = yc[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - zc[0].u32[1] = yc[3].u32[0]; - + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + zc[0].u32[1] = yc[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + zc[0].u32[2] = yc[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + zc[0].u32[3] = yc[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; + zc[1].u32[0] = yc[1].u32[0]; za[1].u32[1] = ya[0].u32[1]; zb[1].u32[1] = yb[0].u32[1]; zc[1].u32[1] = yc[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - zc[1].u32[0] = yc[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - zc[1].u32[3] = yc[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - zc[1].u32[2] = yc[3].u32[1]; - + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + zc[1].u32[2] = yc[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + 
zc[1].u32[3] = yc[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + zc[2].u32[0] = yc[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; + zc[2].u32[1] = yc[1].u32[1]; za[2].u32[2] = ya[0].u32[2]; zb[2].u32[2] = yb[0].u32[2]; zc[2].u32[2] = yc[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - zc[2].u32[1] = yc[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - zc[2].u32[0] = yc[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - zc[2].u32[3] = yc[3].u32[2]; - + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + zc[2].u32[3] = yc[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + zc[3].u32[0] = yc[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + zc[3].u32[1] = yc[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; + zc[3].u32[2] = yc[1].u32[2]; za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; zc[3].u32[3] = yc[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - zc[3].u32[2] = yc[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - zc[3].u32[1] = yc[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - zc[3].u32[0] = yc[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); - BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); - -#endif - - #undef ROL_1X32 - #undef ROR_1X32 - #undef SWAP_64 - #undef ROL32 - #undef ADD32 - #undef XOR - #undef TYPE -} - -void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) -{ - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *X2 = X+64; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; - uint32_t *V2 = V + 64*N; - - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - for ( int k = 0; k < 4; k++ ) - { - _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); - _mm256_stream_si256( (__m256i*)V2 + i*4 + k, casti_m256i( X2, k ) ); - } - - #else - - memcpy( &V0[ i*32 ], X0, 128 ); - memcpy( &V1[ i*32 ], X1, 128 ); - memcpy( &V2[ i*32 ], X2, 128 ); - - #endif - - salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], - &X0[16], &X1[16], &X2[16] ); - salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], - &X0[ 0], &X1[ 0], &X2[ 0] ); - } - - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - const int j0 = 4 * ( X0[16] & ( N - 1 ) ); - const int j1 = 4 * ( X1[16] & ( N - 1 ) ); - const int j2 = 4 * ( X2[16] & ( N - 1 ) ); - - for ( int k = 0; k < 4; k++ ) - { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); - const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); -// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); -// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); -// const __m256i v2 = _mm256_load_si256( ( (__m256i*)V2 ) +j2+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( 
X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); - } - - #else - - const int j0 = 8 * ( X0[16] & ( N - 1 ) ); - const int j1 = 8 * ( X1[16] & ( N - 1 ) ); - const int j2 = 8 * ( X2[16] & ( N - 1 ) ); - for ( int k = 0; k < 8; k++ ) - { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); - casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); - } - - #endif - -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - const int j2 = 16 * ( X2[16] & ( N - 1 ) ); - - for ( int k = 0; k < 16; k++ ) - { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - ( (uint64_t*)X2 )[k] ^= v2; - } -*/ - -/* - const int j0 = 32 * ( X0[16] & ( N - 1 ) ); - const int j1 = 32 * ( X1[16] & ( N - 1 ) ); - const int j2 = 32 * ( X2[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - { - const uint32_t v0 = V0[ j0+k ]; - const uint32_t v1 = V1[ j1+k ]; - const uint32_t v2 = V2[ j2+k ]; - X0[k] ^= v0; - X1[k] ^= v1; - X2[k] ^= v2; - } -*/ - - salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], - &X0[16], &X1[16], &X2[16] ); - salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], - &X0[ 0], &X1[ 0], &X2[ 0] ); - } -} + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XC[0] = zc[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XC[1] = zc[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XC[2] = zc[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + XC[3] = zc[3].m128; + +#endif +} -// Working. 
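/* Editorial sketch, not part of the patch: both the 2- and 3-buffer scrypt
 * cores now follow the same restructured flow -- shuffle the block into Salsa
 * lane order once, run the whole N-iteration mix on pre-shuffled data, then
 * unshuffle once at the end.  Outline using the 2-buffer names and only the
 * portable memcpy/XOR fallback paths; the real functions add streaming
 * AVX/SSE4.1 store and load variants, and the outline name is invented here. */

   // static void scrypt_core_simd128_2buf_outline( uint32_t *X, uint32_t *V,
   //                                               const uint32_t N )
   // {
   //    uint32_t *X0 = X, *X1 = X + 32;
   //    uint32_t *V0 = V, *V1 = V + 32*N;
   //
   //    salsa_simd128_shuffle_2buf( X0,    X1    );   // convert once, up front
   //    salsa_simd128_shuffle_2buf( X0+16, X1+16 );
   //
   //    for ( int n = 0; n < N; n++ )            // fill V with successive states
   //    {
   //       memcpy( &V0[ n*32 ], X0, 128 );
   //       memcpy( &V1[ n*32 ], X1, 128 );
   //       salsa8_simd128_2buf( X0,    X1,    X0+16, X1+16 );
   //       salsa8_simd128_2buf( X0+16, X1+16, X0,    X1    );
   //    }
   //
   //    for ( int n = 0; n < N; n++ )            // data-dependent reads from V
   //    {
   //       const int j0 = 32 * ( X0[16] & (N-1) );
   //       const int j1 = 32 * ( X1[16] & (N-1) );
   //       for ( int i = 0; i < 32; i++ )
   //       {
   //          X0[i] ^= V0[ j0+i ];
   //          X1[i] ^= V1[ j1+i ];
   //       }
   //       salsa8_simd128_2buf( X0,    X1,    X0+16, X1+16 );
   //       salsa8_simd128_2buf( X0+16, X1+16, X0,    X1    );
   //    }
   //
   //    salsa_simd128_unshuffle_2buf( X0,    X1    ); // restore word order
   //    salsa_simd128_unshuffle_2buf( X0+16, X1+16 );
   // }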
-// Quadruple buffered, 4x memory usage +// Triple buffered, 3x memory usage // No interleaving -static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, - uint32_t *bd, const uint32_t *ca, const uint32_t *cb, - const uint32_t *cc, const uint32_t *cd ) +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) { __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + XC0, XC1, XC2, XC3; __m128i *BA = (__m128i*)ba; __m128i *BB = (__m128i*)bb; __m128i *BC = (__m128i*)bc; - __m128i *BD = (__m128i*)bd; const __m128i *CA = (const __m128i*)ca; const __m128i *CB = (const __m128i*)cb; const __m128i *CC = (const __m128i*)cc; - const __m128i *CD = (const __m128i*)cd; // define targets for macros used in round function template #define ROL_1X32 mm128_shufll_32 @@ -3396,397 +3032,42 @@ static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #define XOR _mm_xor_si128 #define TYPE __m128i - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BC[0] = _mm_xor_si128( BC[0], CC[0] ); - BD[0] = _mm_xor_si128( BD[0], CD[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BC[1] = _mm_xor_si128( BC[1], CC[1] ); - BD[1] = _mm_xor_si128( BD[1], CD[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BC[2] = _mm_xor_si128( BC[2], CC[2] ); - BD[2] = _mm_xor_si128( BD[2], CD[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); - BC[3] = _mm_xor_si128( BC[3], CC[3] ); - BD[3] = _mm_xor_si128( BD[3], CD[3] ); - -#if defined(__SSE4_1__) - - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, - YC0, YC1, YC2, YC3, YD0, YD1, YD2, YD3; - -#if defined(__AVX2__) - - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); - YD0 = _mm_blend_epi32( BD[1], BD[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); - XD0 = _mm_blend_epi32( BD[3], BD[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); - YD1 = _mm_blend_epi32( BD[2], BD[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); - XD1 = _mm_blend_epi32( BD[0], BD[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); - YD2 = _mm_blend_epi32( BD[3], BD[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); - XD2 = _mm_blend_epi32( BD[1], BD[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); - YD3 = _mm_blend_epi32( BD[0], BD[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); - XD3 = _mm_blend_epi32( BD[2], BD[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); - XD0 = _mm_blend_epi32( XD0, YD0, 0x3 ); - - XA1 = 
_mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); - XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); - XD1 = _mm_blend_epi32( XD1, YD1, 0x3 ); - - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); - XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); - XD2 = _mm_blend_epi32( XD2, YD2, 0x3 ); - - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); - XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); - XD3 = _mm_blend_epi32( XD3, YD3, 0x3 ); - -#else // SSE4_1 - - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); - YD0 = _mm_blend_epi16( BD[1], BD[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); - XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); - XD0 = _mm_blend_epi16( BD[3], BD[2], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); - XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); - XD0 = _mm_blend_epi16( XD0, YD0, 0x0f ); - - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); - YD1 = _mm_blend_epi16( BD[2], BD[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); - XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); - XD1 = _mm_blend_epi16( BD[0], BD[3], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); - XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); - XD1 = _mm_blend_epi16( XD1, YD1, 0x0f ); - - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); - YD2 = _mm_blend_epi16( BD[3], BD[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); - XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); - XD2 = _mm_blend_epi16( BD[1], BD[0], 0x30 ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); - XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); - XD2 = _mm_blend_epi16( XD2, YD2, 0x0f ); - - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); - YD3 = _mm_blend_epi16( BD[0], BD[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); - XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); - XD3 = _mm_blend_epi16( BD[2], BD[1], 0x30 ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); - XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); - XD3 = _mm_blend_epi16( XD3, YD3, 0x0f ); - -#endif // AVX2 else SSE3_1 - - SALSA_8ROUNDS_SIMD128_4BUF; - -#if defined(__AVX2__) - - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); - YD0 = _mm_blend_epi32( XD0, XD1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); - YD1 = _mm_blend_epi32( XD0, XD1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); - YD2 = _mm_blend_epi32( XD0, XD1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); - YD3 = _mm_blend_epi32( XD0, XD1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, 
XB2, 0x4 ); - YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); - YD0 = _mm_blend_epi32( YD0, XD2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); - YD1 = _mm_blend_epi32( YD1, XD2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); - YD2 = _mm_blend_epi32( YD2, XD2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); - YD3 = _mm_blend_epi32( YD3, XD2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); - YD0 = _mm_blend_epi32( YD0, XD3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); - YD1 = _mm_blend_epi32( YD1, XD3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); - YD2 = _mm_blend_epi32( YD2, XD3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); - YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); - YD3 = _mm_blend_epi32( YD3, XD3, 0x1 ); - -#else // SSE4_1 - - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); - YD0 = _mm_blend_epi16( XD0, XD1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); - YD1 = _mm_blend_epi16( XD0, XD1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); - YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); - YD2 = _mm_blend_epi16( XD0, XD1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); - YD3 = _mm_blend_epi16( XD0, XD1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); - YD0 = _mm_blend_epi16( YD0, XD2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); - YD1 = _mm_blend_epi16( YD1, XD2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); - YD2 = _mm_blend_epi16( YD2, XD2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); - YD3 = _mm_blend_epi16( YD3, XD2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); - YD0 = _mm_blend_epi16( YD0, XD3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); - YD1 = _mm_blend_epi16( YD1, XD3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); - YD2 = _mm_blend_epi16( YD2, XD3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); - YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); - YD3 = _mm_blend_epi16( YD3, XD3, 0x03 ); - -#endif // AVX2 else SSE4_1 - - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BC[0] = _mm_add_epi32( BC[0], YC0 ); - BD[0] = _mm_add_epi32( BD[0], YD0 ); - 
BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BC[1] = _mm_add_epi32( BC[1], YC1 ); - BD[1] = _mm_add_epi32( BD[1], YD1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BC[2] = _mm_add_epi32( BC[2], YC2 ); - BD[2] = _mm_add_epi32( BD[2], YD2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); - BC[3] = _mm_add_epi32( BC[3], YC3 ); - BD[3] = _mm_add_epi32( BD[3], YD3 ); - -#else // SSE2 - - m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4], yd[4], zd[4]; - - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); - XD0 = _mm_set_epi32( bd[15], bd[10], bd[ 5], bd[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); - XD1 = _mm_set_epi32( bd[ 3], bd[14], bd[ 9], bd[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); - XD2 = _mm_set_epi32( bd[ 7], bd[ 2], bd[13], bd[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); - XD3 = _mm_set_epi32( bd[11], bd[ 6], bd[ 1], bd[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_4BUF; - - ya[0].m128 = XA0; - yb[0].m128 = XB0; - yc[0].m128 = XC0; - yd[0].m128 = XD0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - yc[1].m128 = XC1; - yd[1].m128 = XD1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - yc[2].m128 = XC2; - yd[2].m128 = XD2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; - yc[3].m128 = XC3; - yd[3].m128 = XD3; - - za[0].u32[0] = ya[0].u32[0]; - zb[0].u32[0] = yb[0].u32[0]; - zc[0].u32[0] = yc[0].u32[0]; - zd[0].u32[0] = yd[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - zc[0].u32[3] = yc[1].u32[0]; - zd[0].u32[3] = yd[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - zc[0].u32[2] = yc[2].u32[0]; - zd[0].u32[2] = yd[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - zc[0].u32[1] = yc[3].u32[0]; - zd[0].u32[1] = yd[3].u32[0]; - - za[1].u32[1] = ya[0].u32[1]; - zb[1].u32[1] = yb[0].u32[1]; - zc[1].u32[1] = yc[0].u32[1]; - zd[1].u32[1] = yd[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - zc[1].u32[0] = yc[1].u32[1]; - zd[1].u32[0] = yd[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - zc[1].u32[3] = yc[2].u32[1]; - zd[1].u32[3] = yd[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - zc[1].u32[2] = yc[3].u32[1]; - zd[1].u32[2] = yd[3].u32[1]; + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] ); + XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_3BUF; - za[2].u32[2] = ya[0].u32[2]; - 
zb[2].u32[2] = yb[0].u32[2]; - zc[2].u32[2] = yc[0].u32[2]; - zd[2].u32[2] = yd[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - zc[2].u32[1] = yc[1].u32[2]; - zd[2].u32[1] = yd[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - zc[2].u32[0] = yc[2].u32[2]; - zd[2].u32[0] = yd[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - zc[2].u32[3] = yc[3].u32[2]; - zd[2].u32[3] = yd[3].u32[2]; +#else - za[3].u32[3] = ya[0].u32[3]; - zb[3].u32[3] = yb[0].u32[3]; - zc[3].u32[3] = yc[0].u32[3]; - zd[3].u32[3] = yd[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - zc[3].u32[2] = yc[1].u32[3]; - zd[3].u32[2] = yd[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - zc[3].u32[1] = yc[2].u32[3]; - zd[3].u32[1] = yd[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - zc[3].u32[0] = yc[3].u32[3]; - zd[3].u32[0] = yd[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); - BD[0] = _mm_add_epi32( BD[0], zd[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); - BD[1] = _mm_add_epi32( BD[1], zd[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); - BD[2] = _mm_add_epi32( BD[2], zd[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); - BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); - BD[3] = _mm_add_epi32( BD[3], zd[3].m128 ); + SALSA_8ROUNDS_SIMD128_3BUF_SLOROT; #endif + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BC[0] = _mm_add_epi32( BC[0], XC0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BC[1] = _mm_add_epi32( BC[1], XC1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BC[2] = _mm_add_epi32( BC[2], XC2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + BC[3] = _mm_add_epi32( BC[3], XC3 ); + #undef ROL_1X32 #undef ROR_1X32 #undef SWAP_64 @@ -3796,105 +3077,108 @@ static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #undef TYPE } -void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ) +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) { - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *X2 = X+64; - uint32_t *X3 = X+96; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; - uint32_t *V2 = V + 64*N; - uint32_t *V3 = V + 96*N; - - for ( int i = 0; i < N; i++ ) + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + salsa_simd128_shuffle_3buf( X0, X1, X2 ); + salsa_simd128_shuffle_3buf( X0+16, X1+16, X2+16 ); + + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) { - _mm_stream_si128( (__m128i*)V0 + i*8 + k, casti_m128i( X0, k ) ); - _mm_stream_si128( (__m128i*)V1 + i*8 + k, casti_m128i( X1, k ) ); - _mm_stream_si128( (__m128i*)V2 + i*8 + k, casti_m128i( X2, k ) ); - _mm_stream_si128( (__m128i*)V3 + i*8 + k, casti_m128i( X3, k ) ); + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + 
_mm256_stream_si256( (__m256i*)V2 + n*4 + i, casti_m256i( X2, i ) ); } - salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - const int j0 = 4 * ( X0[16] & ( N - 1 ) ); - const int j1 = 4 * ( X1[16] & ( N - 1 ) ); - const int j2 = 4 * ( X2[16] & ( N - 1 ) ); - const int j3 = 4 * ( X3[16] & ( N - 1 ) ); + #elif defined(__SSE4_1__) - for ( int k = 0; k < 4; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); - const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); - const __m256i v3 = _mm256_stream_load_si256( ( (__m256i*)V3 ) +j3+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); - casti_m256i( X3, k ) = _mm256_xor_si256( casti_m256i( X3, k ), v3 ); + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + _mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) ); } #else - - const int j0 = 8 * ( X0[16] & ( N - 1 ) ); - const int j1 = 8 * ( X1[16] & ( N - 1 ) ); - const int j2 = 8 * ( X2[16] & ( N - 1 ) ); - const int j3 = 8 * ( X3[16] & ( N - 1 ) ); - for ( int k = 0; k < 8; k++ ) - { - #if defined(__SSE4_1__) - const __m128i v0 = _mm_stream_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_stream_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_stream_load_si128( ( (__m128i*)V2 ) +j2+k ); - const __m128i v3 = _mm_stream_load_si128( ( (__m128i*)V3 ) +j3+k ); - #else - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); - const __m128i v3 = _mm_load_si128( ( (__m128i*)V3 ) +j3+k ); - #endif - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); - casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); - casti_m128i( X3, k ) = _mm_xor_si128( casti_m128i( X3, k ), v3 ); - } + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); + memcpy( &V2[ n*32 ], X2, 128 ); - #endif + #endif -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - const int j2 = 16 * ( X2[16] & ( N - 1 ) ); - const int j3 = 16 * ( X3[16] & ( N - 1 ) ); + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); + } - for ( int k = 0; k < 16; k++ ) + for ( int n = 0; n < N; n++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + const int j2 = 4 * ( X2[16] & ( N-1 ) ); + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v20 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v21 = _mm256_stream_load_si256( ( (__m256i*)V2 ) 
+j2+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v22 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + const __m256i v23 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X2, 0 ) = _mm256_xor_si256( casti_m256i( X2, 0 ), v20 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X2, 1 ) = _mm256_xor_si256( casti_m256i( X2, 1 ), v21 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X2, 2 ) = _mm256_xor_si256( casti_m256i( X2, 2 ), v22 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); + casti_m256i( X2, 3 ) = _mm256_xor_si256( casti_m256i( X2, 3 ), v23 ); + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + const int j2 = 8 * ( X2[16] & ( N-1 ) ); + for ( int i = 0; i < 8; i++ ) { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; - const uint64_t v3 = ( (uint64_t*)V3 )[ j3+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - ( (uint64_t*)X2 )[k] ^= v2; - ( (uint64_t*)X3 )[k] ^= v3; + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 ); } -*/ - salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + #endif + + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); } + + salsa_simd128_unshuffle_3buf( X0, X1, X2 ); + salsa_simd128_unshuffle_3buf( X0+16, X1+16, X2+16 ); + } @@ -3961,17 +3245,17 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C) void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128 ); + memcpy( &V[ n*32 ], X, 128 ); xor_salsa8( &X[ 0], &X[16] ); xor_salsa8( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { int j = 32 * ( X[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= V[j + k]; + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; xor_salsa8( &X[ 0], &X[16] ); xor_salsa8( &X[16], &X[ 0] ); } diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index a15b5cb1..e919ccb3 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -146,6 +146,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, output[i] = bswap_32( ostate[i] ); } +#if 
defined(__SHA__) + +static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, + const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1, + uint32_t *ostate0, uint32_t *ostate1 ) +{ + uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16]; + int i; + + memcpy( pad0, key0 + 16, 16 ); + memcpy( pad0 + 4, keypad, 48 ); + memcpy( pad1, key1 + 16, 16 ); + memcpy( pad1 + 4, keypad, 48 ); + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + tstate0, tstate1 ); + + memcpy( ihash0, tstate0, 32 ); + memcpy( ihash1, tstate1, 32 ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x5c5c5c5c; + pad1[i] = ihash1[i] ^ 0x5c5c5c5c; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c; + + sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x36363636; + pad1[i] = ihash1[i] ^ 0x36363636; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636; + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); +} + +static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, + const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0, + uint32_t *output1 ) +{ + uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8]; + uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16]; + int i, j; + + sha256_ni2way_transform_le( istate0, istate1, salt0, salt1, + tstate0, tstate1 ); + + memcpy( ibuf0, salt0 + 16, 16 ); + memcpy( ibuf0 + 5, innerpad, 44 ); + memcpy( obuf0 + 8, outerpad, 32 ); + memcpy( ibuf1, salt1 + 16, 16 ); + memcpy( ibuf1 + 5, innerpad, 44 ); + memcpy( obuf1 + 8, outerpad, 32 ); + + for ( i = 0; i < 4; i++ ) + { + memcpy( obuf0, istate0, 32 ); + memcpy( obuf1, istate1, 32 ); + ibuf0[4] = ibuf1[4] = i + 1; + + sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1, + obuf0, obuf1 ); + sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1, + ostate0, ostate1 ); + + for ( j = 0; j < 8; j++ ) + { + output0[ 8*i + j ] = bswap_32( ostateb0[j] ); + output1[ 8*i + j ] = bswap_32( ostateb1[j] ); + } + } +} + +static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, + uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, + uint32_t *output0, uint32_t *output1 ) +{ + uint32_t buf0[16], buf1[16]; + int i; + + sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1, + tstate0, tstate1 ); + sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16, + tstate0, tstate1 ); + sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk, + tstate0, tstate1 ); + + memcpy( buf0, tstate0, 32 ); + memcpy( buf0 + 8, outerpad, 32 ); + memcpy( buf1, tstate1, 32 ); + memcpy( buf1 + 8, outerpad, 32 ); + + sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1, + ostate0, ostate1 ); + + for ( i = 0; i < 8; i++ ) + { + output0[i] = bswap_32( ostate0[i] ); + output1[i] = bswap_32( ostate1[i] ); + } +} + + + +#endif + #ifdef HAVE_SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { @@ -643,10 +756,10 @@ static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[8 * 8]; - uint32_t _ALIGN(128) ostate[8 * 8]; - uint32_t _ALIGN(128) W[8 * 32]; - uint32_t _ALIGN(128) X[8 * 32]; + uint32_t 
_ALIGN(128) tstate[ 8*8 ]; + uint32_t _ALIGN(128) ostate[ 8*8 ]; + uint32_t _ALIGN(128) W[ 8*32 ]; + uint32_t _ALIGN(128) X[ 8*32 ]; uint32_t *V = (uint32_t*)scratchpad; intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, @@ -658,53 +771,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); + + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + } + else + { + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); + } + // SCRYPT CORE - - // AVX512 - -/* - // AVX512 16 way working - intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, - X+256, X+256+32, X+256+64, X+256+96, X+256+128, - X+256+160, X+256+192, X+256+224, 1024 ); - - scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); - - dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, - X+256, X+256+32, X+256+64, X+256+96, X+256+128, - X+256+160, X+256+192, X+256+224, W, 1024 ); -*/ -/* - // AVX512 working - intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); - intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); - scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); - dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); - dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); -*/ -/* - // AVX512, not working, very slow - intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); - intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); - scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); - dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); - dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); -*/ - // AVX2 -/* + // AVX2 // disable de/interleave for testing. 
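// Editor's note (hedged scalar sketch, not part of the patch): the
// multi-buffer path selected above for large N (scrypt_core_simd128_2buf/
// _3buf) advances independent lanes together so the data-dependent
// scratchpad loads of one lane overlap the salsa arithmetic of the others.
// xor_salsa8() is assumed to be the scalar reference from
// scrypt-core-4way.c; X0/X1 are 32 uint32_t each, V holds both lane regions.
static void scrypt_core_2buf_sketch( uint32_t *X0, uint32_t *X1,
                                     uint32_t *V, uint32_t N )
{
   uint32_t *V0 = V;           // each lane owns its own N*32 word region,
   uint32_t *V1 = V + 32*N;    // as in scrypt_core_simd128_3buf
   for ( uint32_t n = 0; n < N; n++ )
   {
      memcpy( &V0[ n*32 ], X0, 128 );     // save current lane states
      memcpy( &V1[ n*32 ], X1, 128 );
      xor_salsa8( &X0[ 0], &X0[16] );   xor_salsa8( &X0[16], &X0[ 0] );
      xor_salsa8( &X1[ 0], &X1[16] );   xor_salsa8( &X1[16], &X1[ 0] );
   }
   for ( uint32_t n = 0; n < N; n++ )
   {
      const uint32_t j0 = 32 * ( X0[16] & ( N-1 ) );   // independent,
      const uint32_t j1 = 32 * ( X1[16] & ( N-1 ) );   // data-dependent
      for ( int i = 0; i < 32; i++ )
      {
         X0[i] ^= V0[ j0+i ];
         X1[i] ^= V1[ j1+i ];
      }
      xor_salsa8( &X0[ 0], &X0[16] );   xor_salsa8( &X0[16], &X0[ 0] );
      xor_salsa8( &X1[ 0], &X1[16] );   xor_salsa8( &X1[16], &X1[ 0] );
   }
}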
- scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); -*/ +// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); + /* // AVX2 working @@ -714,23 +819,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+192, X+192, X+224, 1024 ); // working -// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); // if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); // working - scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); - - // working -// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); @@ -745,18 +845,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x32( W+128, X+128, X+160, 1024 ); intrlv_2x32( W+192, X+192, X+224, 1024 ); - // working, deprecated, not up to data -// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); - - // deprecated, not up to date -// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - // working -// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); // scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; @@ -813,19 +905,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+192, V, N ); */ - +/************** scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_3buf( X+ 96, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+192, V, N ); - -/* - // SSE2 working - scrypt_core_simd128_4buf( X, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+128, V, N ); -*/ +*************/ if ( work_restart[thrid].restart ) return 0; @@ -868,6 +954,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, W, 1024 ); + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( 
X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); + } + else + { + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); + intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); + scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 ); + dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 ); + } + // SCRYPT CORE @@ -888,23 +1007,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // AVX512 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); */ /* - // AVX512, not working, very slow + // AVX512, working intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N ); dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); */ + // AVX2 /* @@ -919,16 +1055,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); intrlv_2x128( W+128, X+128, X+160, 1024 ); intrlv_2x128( W+192, X+192, X+224, 1024 ); - - // working -// scrypt_core_2way_simd128_3buf( 
(__m256i*) W, (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + intrlv_2x128( W+256, X+256, X+256+ 32, 1024 ); + intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 ); + intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 ); + intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 ); // working scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N ); // working // scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); @@ -938,11 +1077,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); // if ( work_restart[thrid].restart ) return 0; // scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); dintrlv_2x128( X+128, X+160, W+128, 1024 ); dintrlv_2x128( X+192, X+224, W+192, 1024 ); + dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 ); + dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 ); + dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 ); + dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 ); */ /* @@ -952,18 +1103,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_2x32( W+128, X+128, X+160, 1024 ); intrlv_2x32( W+192, X+192, X+224, 1024 ); - // working, deprecated, not up to data -// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); - - // deprecated, not up to date -// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - // working // scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; // scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); // scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; @@ -1043,7 +1189,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+448, V, N ); */ - +/*************** scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_3buf( X+ 96, V, N ); @@ -1055,17 +1201,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, scrypt_core_simd128_3buf( X+352, V, N ); if ( work_restart[thrid].restart ) return 
0; scrypt_core_simd128_2buf( X+448, V, N ); - -/* - // SSE2 working - scrypt_core_simd128_4buf( X, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+128, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+256, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+384, V, N ); -*/ +********************/ /* scrypt_core_3way( X, V, N ); if ( work_restart[thrid].restart ) return 0; @@ -1102,6 +1238,31 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, #if defined(__SHA__) +static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[ 2*8 ]; + uint32_t _ALIGN(128) ostate[ 2*8 ]; + uint32_t _ALIGN(128) W[ 2*32 ]; + uint32_t *V = (uint32_t*)scratchpad; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + + HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8, + ostate, ostate+8 ); + PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, + input, input+20, W, W+32 ); + + scrypt_core_simd128_2buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + + PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32, + output, output+8 ); + + return 1; +} + static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { @@ -1149,8 +1310,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, scrypt_core_simd128( W+96, V, N ); */ - // working -// scrypt_core_simd128_4buf( W, V, N ); if ( work_restart[thrid].restart ) return 0; @@ -1171,10 +1330,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; + uint32_t _ALIGN(128) tstate[ 4*8 ]; + uint32_t _ALIGN(128) ostate[ 4*8 ]; + uint32_t _ALIGN(128) W[ 4*32 ]; uint32_t *V = (uint32_t*)scratchpad; intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); @@ -1184,7 +1342,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, HMAC_SHA256_80_init_4way(W, tstate, ostate); PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + if ( opt_param_n > 0x4000 ) + { + uint32_t _ALIGN(128) X[ 4*32 ]; + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); + } + else + scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); + + + +// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); ////// SCRYPT_CORE @@ -1202,35 +1374,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128( X+96, V, N ); */ - +/* // working, double buffered linear simd, best for n2 scrypt_core_simd128_2buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+64, V, N ); - +*/ /* scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128( X+96, V, N ); */ - // working -// 
scrypt_core_simd128_4buf( X, V, N ); - - -/* - // original - scrypt_core(X + 0 * 32, V, N); - scrypt_core(X + 1 * 32, V, N); - scrypt_core(X + 2 * 32, V, N); - scrypt_core(X + 3 * 32, V, N); -*/ - //////////////////////////////// if ( work_restart[thrid].restart ) return 0; - intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); +// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); @@ -1247,22 +1407,22 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; + uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; int thr_id = mythr->id; int throughput = scrypt_throughput; - int i; + int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); - for ( i = 0; i < throughput; i++ ) - memcpy( data + i * 20, pdata, 80 ); + for ( i = 0; i < throughput; i++ ) + memcpy( data + i * 20, pdata, 80 ); sha256_transform_le( midstate, data, sha256_initial_state ); - do { + do { bool rc = true; - for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; + for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) if ( throughput == 16 ) @@ -1276,7 +1436,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, opt_param_n, thr_id ); else #endif - if ( throughput == 4 ) + if ( throughput == 4 ) // slower on Ryzen than 8way #if defined(__SHA__) rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); @@ -1284,10 +1444,17 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); #endif +#if defined(__SHA__) else + if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way + rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); +#endif + else // should never get here rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); + // test the hash if ( rc ) for ( i = 0; i < throughput; i++ ) { @@ -1319,11 +1486,11 @@ bool scrypt_miner_thread_init( int thr_id ) bool register_scrypt_algo( algo_gate_t* gate ) { -#if defined(__SHA__) - gate->optimizations = SSE2_OPT | SHA_OPT; -#else +//#if defined(__SHA__) +// gate->optimizations = SSE2_OPT | SHA_OPT; +//#else gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; -#endif +//#endif gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->scanhash = (void*)&scanhash_scrypt; opt_target_factor = 65536.0; @@ -1332,16 +1499,29 @@ bool register_scrypt_algo( algo_gate_t* gate ) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) scrypt_throughput = 16; - scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way + +/* SHA is slower than AVX2 on Ryzen #elif defined(__SHA__) scrypt_throughput = 4; scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +*/ + #elif defined(__AVX2__) scrypt_throughput = 8; - scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 2 * 128; // 2 way 
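/* Editor's note (illustration only, not part of the patch): each scrypt
   lane needs N * 128 bytes of scratchpad, so scratchbuf_size is
   N * 128 * (concurrent lanes per thread). With the 0x4000 (16384)
   threshold above, scryptn2 at e.g. N = 1048576 on AVX2 allocates
   1048576 * 128 * 3 = 384 MiB per thread (three linear buffers), while
   scrypt:1024 allocates 1024 * 128 * 2 = 256 KiB per thread (one 2-way
   interleaved buffer). */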
#else scrypt_throughput = 4; + if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 2 * 128; // 2 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way #endif char t_units[4] = {0}; diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 7b6618c4..de3f1d43 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -51,7 +51,6 @@ typedef struct { __m128i buf[64>>2]; __m128i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_4way_context __attribute__ ((aligned (64))); void sha256_4way_init( sha256_4way_context *sc ); @@ -74,7 +73,6 @@ typedef struct { __m256i buf[64>>2]; __m256i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_8way_context __attribute__ ((aligned (128))); void sha256_8way_init( sha256_8way_context *sc ); @@ -96,7 +94,6 @@ typedef struct { __m512i buf[64>>2]; __m512i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_16way_context __attribute__ ((aligned (128))); void sha256_16way_init( sha256_16way_context *sc ); diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index beac702c..1c630cc8 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -107,22 +107,19 @@ do { \ } while (0) // LE data, no need to byte swap -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, + const __m128i *in ) { __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; - __m128i W[16]; - - memcpy_128( W, data, 16 ); - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = in[0]; + B = in[1]; + C = in[2]; + D = in[3]; + E = in[4]; + F = in[5]; + G = in[6]; + H = in[7]; Y_xor_Z = _mm_xor_si128( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); @@ -179,228 +176,46 @@ void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + out[0] = _mm_add_epi32( in[0], A ); + out[1] = _mm_add_epi32( in[1], B ); + out[2] = _mm_add_epi32( in[2], C ); + out[3] = _mm_add_epi32( in[3], D ); + out[4] = _mm_add_epi32( in[4], E ); + out[5] = _mm_add_epi32( in[5], F ); + out[6] = _mm_add_epi32( in[6], G ); + out[7] = _mm_add_epi32( in[7], H ); } -// BE data, need to byte swap -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) { - __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; - - mm128_block_bswap_32( W, data ); - mm128_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, 
A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + memcpy_128( W, data, 16 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } - -static void -sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) +// BE data, need to byte swap input data +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) { - register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; - - mm128_block_bswap_32( W, in ); - mm128_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m128_const1_64( 0x6A09E6676A09E667 ); - B = m128_const1_64( 0xBB67AE85BB67AE85 ); - C = m128_const1_64( 0x3C6EF3723C6EF372 ); - D = m128_const1_64( 0xA54FF53AA54FF53A ); - E = m128_const1_64( 0x510E527F510E527F ); - F = m128_const1_64( 0x9B05688C9B05688C ); - G = m128_const1_64( 0x1F83D9AB1F83D9AB ); - H = m128_const1_64( 0x5BE0CD195BE0CD19 ); - } - - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, 
G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm_add_epi32( r[0], A ); - r[1] = _mm_add_epi32( r[1], B ); - r[2] = _mm_add_epi32( r[2], C ); - r[3] = _mm_add_epi32( r[3], D ); - r[4] = _mm_add_epi32( r[4], E ); - r[5] = _mm_add_epi32( r[5], F ); - r[6] = _mm_add_epi32( r[6], G ); - r[7] = _mm_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } void sha256_4way_init( sha256_4way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm_set1_epi32( H256[0] ); - sc->val[1] = _mm_set1_epi32( H256[1] ); - sc->val[2] = _mm_set1_epi32( H256[2] ); - sc->val[3] = _mm_set1_epi32( 
H256[3] ); - sc->val[4] = _mm_set1_epi32( H256[4] ); - sc->val[5] = _mm_set1_epi32( H256[5] ); - sc->val[6] = _mm_set1_epi32( H256[6] ); - sc->val[7] = _mm_set1_epi32( H256[7] ); -*/ + sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m128_const1_64( 0x510E527F510E527F ); + sc->val[5] = m128_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); } void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) @@ -424,7 +239,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -449,7 +264,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_128( sc->buf, pad >> 2 ); } else @@ -461,7 +276,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) ); sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); } @@ -539,8 +354,7 @@ do { \ #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ __m256i T1 = BSG2_1x( E ); \ __m256i T2 = BSG2_0x( A ); \ T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ @@ -552,45 +366,74 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) -/* -#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ +// the X_xor_y technique can be extended to eliminate the mov instruction. +// Perform double rounds and alternate each round. Doesn't apply to AVX512 +// and isn't suitable for running 3 round prehash. +// +// read Y_xor_Z, update X_xor_Y +#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + +// start with toc initialized to y^z: toc = B ^ C +// First round reads toc as Y_xor_Z and saves X_xor_Y as tic. +// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc. 
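// Editor's note (scalar illustration, not part of the patch): the identity
// behind MAJ_2step is Maj(x,y,z) = y ^ ((x ^ y) & (y ^ z)). Because the
// SHA-256 working variables rotate every round, this round's y^z equals the
// previous round's x^y, so keeping two running xors (tic, toc) and
// alternating their roles removes the register move a single cached
// Y_xor_Z variable would need.
static inline uint32_t maj_cached( uint32_t x, uint32_t y, uint32_t z,
                                   uint32_t *save_x_xor_y, // written: x ^ y
                                   uint32_t prev_x_xor_y ) // read as  y ^ z
{
   *save_x_xor_y = x ^ y;
   return y ^ ( *save_x_xor_y & prev_x_xor_y );
}
// Two consecutive rounds, seeded with toc = B ^ C as in the macro below:
//    uint32_t tic, toc = b ^ c;
//    m0 = maj_cached( a, b, c, &tic, toc );  // round i:   Maj(A,B,C)
//    m1 = maj_cached( h, a, b, &toc, tic );  // round i+1: Maj(H,A,B)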
+ +#define SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, i0, i1, j ) \ do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \ - T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \ + W[ i0 ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ +\ + T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \ + W[ (i1) ] ); \ + T1 = BSG2_1x( D ); \ + T2 = BSG2_0x( H ); \ + T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \ + T1 = _mm256_add_epi32( T1, G ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + C = _mm256_add_epi32( C, T1 ); \ + G = _mm256_add_epi32( T1, T2 ); \ } while (0) -*/ #endif // AVX512VL else AVX2 -// accepts LE byte ordered data, skip the byte swap -void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, - const __m256i *state_in ) +static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, + const __m256i *in ) \ { __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif - __m256i W[16]; - memcpy_256( W, data, 16 ); - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = _mm256_load_si256( in ); + B = _mm256_load_si256( in+1 ); + C = _mm256_load_si256( in+2 ); + D = _mm256_load_si256( in+3 ); + E = _mm256_load_si256( in+4 ); + F = _mm256_load_si256( in+5 ); + G = _mm256_load_si256( in+6 ); + H = _mm256_load_si256( in+7 ); #if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif + + __m256i tic, toc = _mm256_xor_si256( B, C ); + + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, 0 ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, 0 ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, 0 ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, 0 ); + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, 0 ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, 0 ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, 0 ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, 0 ); + +#else SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -609,6 +452,8 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); +#endif + for ( int j = 16; j < 64; j += 16 ) { W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); @@ -628,6 +473,19 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); +#if !defined(__AVX512VL__) + + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, 
A, B, 14, 15, j ); + +#else + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); @@ -644,244 +502,52 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + +#endif } - state_out[0] = _mm256_add_epi32( state_in[0], A ); - state_out[1] = _mm256_add_epi32( state_in[1], B ); - state_out[2] = _mm256_add_epi32( state_in[2], C ); - state_out[3] = _mm256_add_epi32( state_in[3], D ); - state_out[4] = _mm256_add_epi32( state_in[4], E ); - state_out[5] = _mm256_add_epi32( state_in[5], F ); - state_out[6] = _mm256_add_epi32( state_in[6], G ); - state_out[7] = _mm256_add_epi32( state_in[7], H ); + out[0] = _mm256_add_epi32( in[0], A ); + out[1] = _mm256_add_epi32( in[1], B ); + out[2] = _mm256_add_epi32( in[2], C ); + out[3] = _mm256_add_epi32( in[3], D ); + out[4] = _mm256_add_epi32( in[4], E ); + out[5] = _mm256_add_epi32( in[5], F ); + out[6] = _mm256_add_epi32( in[6], G ); + out[7] = _mm256_add_epi32( in[7], H ); } - -// Accepts BE byte ordered data, need to byte swap -void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, +// accepts LE input data +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { - __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif __m256i W[16]; - - mm256_block_bswap_32( W , data ); - mm256_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - -#if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( 
E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm256_add_epi32( state_in[0], A ); - state_out[1] = _mm256_add_epi32( state_in[1], B ); - state_out[2] = _mm256_add_epi32( state_in[2], C ); - state_out[3] = _mm256_add_epi32( state_in[3], D ); - state_out[4] = _mm256_add_epi32( state_in[4], E ); - state_out[5] = _mm256_add_epi32( state_in[5], F ); - state_out[6] = _mm256_add_epi32( state_in[6], G ); - state_out[7] = _mm256_add_epi32( state_in[7], H ); + memcpy_256( W, data, 16 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } -static void -sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +// Accepts BE input data, need to bswap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) { - register __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif __m256i W[16]; - - mm256_block_bswap_32( W , in ); - mm256_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m256_const1_64( 0x6A09E6676A09E667 ); - B = m256_const1_64( 0xBB67AE85BB67AE85 ); - C = m256_const1_64( 0x3C6EF3723C6EF372 ); - D = m256_const1_64( 0xA54FF53AA54FF53A ); - E = m256_const1_64( 0x510E527F510E527F ); - F = m256_const1_64( 0x9B05688C9B05688C ); - G = m256_const1_64( 0x1F83D9AB1F83D9AB ); - H = m256_const1_64( 0x5BE0CD195BE0CD19 ); - } - -#if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = 
SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm256_add_epi32( r[0], A ); - r[1] = _mm256_add_epi32( r[1], B ); - r[2] = _mm256_add_epi32( r[2], C ); - r[3] = _mm256_add_epi32( r[3], D ); - r[4] = _mm256_add_epi32( r[4], E ); - r[5] = _mm256_add_epi32( r[5], F ); - r[6] = _mm256_add_epi32( r[6], G ); - r[7] = _mm256_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } void sha256_8way_init( sha256_8way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm256_set1_epi32( H256[0] ); - sc->val[1] = _mm256_set1_epi32( H256[1] ); - sc->val[2] = _mm256_set1_epi32( H256[2] ); - sc->val[3] = _mm256_set1_epi32( H256[3] ); - sc->val[4] = _mm256_set1_epi32( H256[4] ); - sc->val[5] = _mm256_set1_epi32( H256[5] ); - sc->val[6] = _mm256_set1_epi32( H256[6] ); - sc->val[7] = _mm256_set1_epi32( H256[7] ); -*/ + sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m256_const1_64( 0x510E527F510E527F ); + sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); } - // need to handle odd byte length for yespower. // Assume only last update is odd. 
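// Editor's note (usage sketch, not part of the patch): with the lazy
// "initialized" flag removed, sha256_8way_init() seeds val[] with the
// SHA-256 IV directly, so update()/close() below can push every block
// through sha256_8way_transform_be() with no first-block special case.
// The input layout (8 lanes interleaved as 32-bit words, 80 bytes per lane
// here) is an assumption for the example.
static void sha256_8way_80byte_example( void *out_8x32, const void *in_8x32 )
{
   sha256_8way_context ctx;
   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, in_8x32, 80 );
   sha256_8way_close( &ctx, out_8x32 );    // 8 interleaved 32-byte digests
}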
@@ -906,7 +572,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -931,7 +597,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_256( sc->buf, pad >> 2 ); } else @@ -944,7 +610,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); mm256_block_bswap_32( dst, sc->val ); } @@ -986,8 +652,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) #define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \ __m512i T1 = BSG2_1x16( E ); \ __m512i T2 = BSG2_0x16( A ); \ T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ @@ -1011,23 +676,19 @@ do { \ } while (0) */ -// accepts LE input data -void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, - const __m512i *state_in ) + +static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, + const __m512i *in ) \ { __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; - - memcpy_512( W, data, 16 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = _mm512_load_si512( in ); + B = _mm512_load_si512( in+1 ); + C = _mm512_load_si512( in+2 ); + D = _mm512_load_si512( in+3 ); + E = _mm512_load_si512( in+4 ); + F = _mm512_load_si512( in+5 ); + G = _mm512_load_si512( in+6 ); + H = _mm512_load_si512( in+7 ); SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -1083,100 +744,36 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - state_out[0] = _mm512_add_epi32( state_in[0], A ); - state_out[1] = _mm512_add_epi32( state_in[1], B ); - state_out[2] = _mm512_add_epi32( state_in[2], C ); - state_out[3] = _mm512_add_epi32( state_in[3], D ); - state_out[4] = _mm512_add_epi32( state_in[4], E ); - state_out[5] = _mm512_add_epi32( state_in[5], F ); - state_out[6] = _mm512_add_epi32( state_in[6], G ); - state_out[7] = _mm512_add_epi32( state_in[7], H ); + out[0] = _mm512_add_epi32( in[0], A ); + out[1] = _mm512_add_epi32( in[1], B ); + out[2] = _mm512_add_epi32( in[2], C ); + out[3] = _mm512_add_epi32( in[3], D ); + out[4] = _mm512_add_epi32( in[4], E ); + out[5] = _mm512_add_epi32( in[5], F ); + out[6] = _mm512_add_epi32( in[6], G ); + out[7] = _mm512_add_epi32( in[7], H ); +} + +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i W[16]; + memcpy_512( W, data, 16 ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } // Accepts BE input data, need to bswap void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { - __m512i A, B, C, D, E, F, G, H; __m512i W[16]; - 
mm512_block_bswap_32( W , data ); mm512_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm512_add_epi32( state_in[0], A ); - state_out[1] = _mm512_add_epi32( state_in[1], B ); - state_out[2] = _mm512_add_epi32( state_in[2], C ); - state_out[3] = _mm512_add_epi32( state_in[3], D ); - state_out[4] = _mm512_add_epi32( state_in[4], E ); - state_out[5] = _mm512_add_epi32( state_in[5], F ); - state_out[6] = _mm512_add_epi32( state_in[6], G ); - state_out[7] = _mm512_add_epi32( state_in[7], H ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } - -// Aggresive prehashing + +// Aggresive prehashing, LE byte order void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ) { @@ -1295,125 +892,19 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, _mm512_store_si512( state_out + 7, H ); } -static void -sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) -{ - register __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; - - mm512_block_bswap_32( W 
, in ); - mm512_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m512_const1_64( 0x6A09E6676A09E667 ); - B = m512_const1_64( 0xBB67AE85BB67AE85 ); - C = m512_const1_64( 0x3C6EF3723C6EF372 ); - D = m512_const1_64( 0xA54FF53AA54FF53A ); - E = m512_const1_64( 0x510E527F510E527F ); - F = m512_const1_64( 0x9B05688C9B05688C ); - G = m512_const1_64( 0x1F83D9AB1F83D9AB ); - H = m512_const1_64( 0x5BE0CD195BE0CD19 ); - } - - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm512_add_epi32( r[0], A ); - r[1] = _mm512_add_epi32( r[1], B ); - r[2] = _mm512_add_epi32( r[2], C ); - r[3] = _mm512_add_epi32( r[3], D ); - r[4] = _mm512_add_epi32( r[4], E ); - r[5] = _mm512_add_epi32( r[5], F ); - r[6] = _mm512_add_epi32( r[6], G ); - r[7] = _mm512_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm512_add_epi32( D, m512_const1_64( 
0xA54FF53AA54FF53A ) ); - r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) ); - } -} - void sha256_16way_init( sha256_16way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; + sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m512_const1_64( 0x510E527F510E527F ); + sc->val[5] = m512_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); } - void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ) { @@ -1436,7 +927,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data, len -= clen; if ( ptr == buf_size ) { - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -1461,7 +952,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_512( sc->buf, pad >> 2 ); } else @@ -1474,7 +965,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); mm512_block_bswap_32( dst, sc->val ); } diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index c53cb39f..8225595b 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -70,6 +70,8 @@ extern "C"{ C8, C9, CA, CB, CC, CD, CE, CF; \ __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ + const __m256i FIVE = _mm256_set1_epi32( 5 ); \ + const __m256i THREE = _mm256_set1_epi32( 3 ); \ sph_u32 Wlow, Whigh; #define READ_STATE8(state) do \ @@ -314,8 +316,7 @@ do { \ _mm256_andnot_si256( xb3, xb2 ), \ _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ - _mm256_set1_epi32(5UL) ) ), \ - _mm256_set1_epi32(3UL) ) ) ); \ + FIVE ) ), THREE ) ) ); \ xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \ } while (0) @@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) C8, C9, CA, CB, CC, CD, CE, CF; \ __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; + const __m128i FIVE = _mm_set1_epi32( 5 ); \ + const __m128i THREE = _mm_set1_epi32( 3 ); \ + sph_u32 Wlow, Whigh; #define READ_STATE(state) do \ { \ @@ -931,8 +934,8 @@ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ _mm_andnot_si128( xb3, xb2 ), \ _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \ - ) ), _mm_set1_epi32(3UL) ) ) ) ); \ + _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \ + ) ), THREE ) ) ) ); \ xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ } while (0) diff --git a/algo/swifftx/inttypes.h b/algo/swifftx/inttypes.h index 
2b6b941b..9f74eee2 100644 --- a/algo/swifftx/inttypes.h +++ b/algo/swifftx/inttypes.h @@ -18,16 +18,20 @@ #ifndef __INTTYPES_H_ #define __INTTYPES_H_ +#include + /* Use [u]intN_t if you need exactly N bits. XXX - doesn't handle the -mint8 option. */ typedef signed char swift_int8_t; typedef unsigned char swift_uint8_t; - typedef int swift_int16_t; + typedef int32_t swift_int16_t; +// typedef int swift_int16_t; typedef unsigned int swift_uint16_t; - typedef long swift_int32_t; + typedef int32_t swift_int32_t; +// typedef long swift_int32_t; typedef unsigned long swift_uint32_t; typedef long long swift_int64_t; diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index f38ea854..d3ecd15c 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -18,6 +18,8 @@ //#include "stdbool.h" #include +#include "simd-utils.h" + /////////////////////////////////////////////////////////////////////////////////////////////// // Constants and static tables portion. /////////////////////////////////////////////////////////////////////////////////////////////// @@ -49,20 +51,20 @@ // - A: the first operand. After the operation stores the sum of the two operands. // - B: the second operand. After the operation stores the difference between the first and the // second operands. -#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} +//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} // Quickly reduces an integer modulo 257. // // Parameters: // - A: the input. -#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) +//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) // Since we need to do the setup only once, this is the indicator variable: static bool wasSetupDone = false; // This array stores the powers of omegas that correspond to the indices, which are the input // values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; +swift_int16_t multipliers[N] __attribute__ ((aligned (64))); // This array stores the powers of omegas, multiplied by the corresponding values. // We store this table to save computation time. @@ -72,14 +74,14 @@ swift_int16_t multipliers[N]; // compression function, i is between 0 and 31, x_i is a 64-bit value. // One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- // formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; +swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64))); // The A's we use in SWIFFTX shall be random elements of Z_257. // We generated these A's from the decimal expansion of PI as follows: we converted each // triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A // element, otherwise move to the next triple of digits in the expansion. This guarntees that // the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = +const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) = {141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, @@ -636,9 +638,202 @@ void InitializeSWIFFTX() wasSetupDone = true; } +// In the original code the F matrix is rotated so it was not aranged +// the same as all the other data. 
Rearanging F to match all the other +// data made vectorizing possible, the compiler probably could have been +// able to auto-vectorize with proper data organisation. +// Also in the original code the custom 16 bit data types are all now 32 +// bit int32_t regardless of the type name. +// void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) { - swift_int16_t *mult = multipliers; +#if defined(__AVX2__) + + __m256i F[8] __attribute__ ((aligned (64))); + __m256i *mul = (__m256i*)multipliers; + __m256i *out = (__m256i*)output; + __m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] ); + + F[0] = _mm256_mullo_epi32( mul[0], *tbl ); + tbl = (__m256i*)&( fftTable[ input[1] << 3 ] ); + F[1] = _mm256_mullo_epi32( mul[1], *tbl ); + tbl = (__m256i*)&( fftTable[ input[2] << 3 ] ); + F[2] = _mm256_mullo_epi32( mul[2], *tbl ); + tbl = (__m256i*)&( fftTable[ input[3] << 3 ] ); + F[3] = _mm256_mullo_epi32( mul[3], *tbl ); + tbl = (__m256i*)&( fftTable[ input[4] << 3 ] ); + F[4] = _mm256_mullo_epi32( mul[4], *tbl ); + tbl = (__m256i*)&( fftTable[ input[5] << 3 ] ); + F[5] = _mm256_mullo_epi32( mul[5], *tbl ); + tbl = (__m256i*)&( fftTable[ input[6] << 3 ] ); + F[6] = _mm256_mullo_epi32( mul[6], *tbl ); + tbl = (__m256i*)&( fftTable[ input[7] << 3 ] ); + F[7] = _mm256_mullo_epi32( mul[7], *tbl ); + + #define ADD_SUB( a, b ) \ + { \ + __m256i tmp = b; \ + b = _mm256_sub_epi32( a, b ); \ + a = _mm256_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[0], F[1] ); + ADD_SUB( F[2], F[3] ); + ADD_SUB( F[4], F[5] ); + ADD_SUB( F[6], F[7] ); + + F[3] = _mm256_slli_epi32( F[3], 4 ); + F[7] = _mm256_slli_epi32( F[7], 4 ); + + ADD_SUB( F[0], F[2] ); + ADD_SUB( F[1], F[3] ); + ADD_SUB( F[4], F[6] ); + ADD_SUB( F[5], F[7] ); + + F[5] = _mm256_slli_epi32( F[5], 2 ); + F[6] = _mm256_slli_epi32( F[6], 4 ); + F[7] = _mm256_slli_epi32( F[7], 6 ); + + ADD_SUB( F[0], F[4] ); + ADD_SUB( F[1], F[5] ); + ADD_SUB( F[2], F[6] ); + ADD_SUB( F[3], F[7] ); + + #undef ADD_SUB + +#if defined (__AVX512VL__) && defined(__AVX512BW__) + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, \ + _mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) ) + +#else + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, \ + m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) ) + +#endif + + out[0] = Q_REDUCE( F[0] ); + out[1] = Q_REDUCE( F[1] ); + out[2] = Q_REDUCE( F[2] ); + out[3] = Q_REDUCE( F[3] ); + out[4] = Q_REDUCE( F[4] ); + out[5] = Q_REDUCE( F[5] ); + out[6] = Q_REDUCE( F[6] ); + out[7] = Q_REDUCE( F[7] ); + + #undef Q_REDUCE + +#elif defined(__SSE4_1__) + + __m128i F[16] __attribute__ ((aligned (64))); + __m128i *mul = (__m128i*)multipliers; + __m128i *out = (__m128i*)output; + __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] ); + + F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] ); + F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[1] << 3 ] ); + F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] ); + F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[2] << 3 ] ); + F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] ); + F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[3] << 3 ] ); + F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] ); + F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[4] << 3 ] ); + F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] ); + F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[5] << 3 ] ); + F[10] = _mm_mullo_epi32( mul[10], tbl[0] ); + F[11] = 
_mm_mullo_epi32( mul[11], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[6] << 3 ] ); + F[12] = _mm_mullo_epi32( mul[12], tbl[0] ); + F[13] = _mm_mullo_epi32( mul[13], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[7] << 3 ] ); + F[14] = _mm_mullo_epi32( mul[14], tbl[0] ); + F[15] = _mm_mullo_epi32( mul[15], tbl[1] ); + + #define ADD_SUB( a, b ) \ + { \ + __m128i tmp = b; \ + b = _mm_sub_epi32( a, b ); \ + a = _mm_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[ 0], F[ 2] ); + ADD_SUB( F[ 1], F[ 3] ); + ADD_SUB( F[ 4], F[ 6] ); + ADD_SUB( F[ 5], F[ 7] ); + ADD_SUB( F[ 8], F[10] ); + ADD_SUB( F[ 9], F[11] ); + ADD_SUB( F[12], F[14] ); + ADD_SUB( F[13], F[15] ); + + F[ 6] = _mm_slli_epi32( F[ 6], 4 ); + F[ 7] = _mm_slli_epi32( F[ 7], 4 ); + F[14] = _mm_slli_epi32( F[14], 4 ); + F[15] = _mm_slli_epi32( F[15], 4 ); + + ADD_SUB( F[ 0], F[ 4] ); + ADD_SUB( F[ 1], F[ 5] ); + ADD_SUB( F[ 2], F[ 6] ); + ADD_SUB( F[ 3], F[ 7] ); + ADD_SUB( F[ 8], F[12] ); + ADD_SUB( F[ 9], F[13] ); + ADD_SUB( F[10], F[14] ); + ADD_SUB( F[11], F[15] ); + + F[10] = _mm_slli_epi32( F[10], 2 ); + F[11] = _mm_slli_epi32( F[11], 2 ); + F[12] = _mm_slli_epi32( F[12], 4 ); + F[13] = _mm_slli_epi32( F[13], 4 ); + F[14] = _mm_slli_epi32( F[14], 6 ); + F[15] = _mm_slli_epi32( F[15], 6 ); + + ADD_SUB( F[ 0], F[ 8] ); + ADD_SUB( F[ 1], F[ 9] ); + ADD_SUB( F[ 2], F[10] ); + ADD_SUB( F[ 3], F[11] ); + ADD_SUB( F[ 4], F[12] ); + ADD_SUB( F[ 5], F[13] ); + ADD_SUB( F[ 6], F[14] ); + ADD_SUB( F[ 7], F[15] ); + + #undef ADD_SUB + + #define Q_REDUCE( a ) \ + _mm_sub_epi32( _mm_and_si128( a, \ + m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) ) + + out[ 0] = Q_REDUCE( F[ 0] ); + out[ 1] = Q_REDUCE( F[ 1] ); + out[ 2] = Q_REDUCE( F[ 2] ); + out[ 3] = Q_REDUCE( F[ 3] ); + out[ 4] = Q_REDUCE( F[ 4] ); + out[ 5] = Q_REDUCE( F[ 5] ); + out[ 6] = Q_REDUCE( F[ 6] ); + out[ 7] = Q_REDUCE( F[ 7] ); + out[ 8] = Q_REDUCE( F[ 8] ); + out[ 9] = Q_REDUCE( F[ 9] ); + out[10] = Q_REDUCE( F[10] ); + out[11] = Q_REDUCE( F[11] ); + out[12] = Q_REDUCE( F[12] ); + out[13] = Q_REDUCE( F[13] ); + out[14] = Q_REDUCE( F[14] ); + out[15] = Q_REDUCE( F[15] ); + + #undef Q_REDUCE + +#else // < SSE4.1 + + swift_int16_t *mult = multipliers; + + // First loop unrolling: + register swift_int16_t *table = &(fftTable[input[0] << 3]); /* swift_int32_t F[64]; @@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F50, F51, F52, F53, F54, F55, F56, F57, F58, F59, F60, F61, F62, F63; - // First loop unrolling: - register swift_int16_t *table = &(fftTable[input[0] << 3]); - - F0 = mult[0] * table[0]; - F8 = mult[1] * table[1]; + F0 = mult[0] * table[0]; + F8 = mult[1] * table[1]; F16 = mult[2] * table[2]; F24 = mult[3] * table[3]; F32 = mult[4] * table[4]; @@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F48 = mult[6] * table[6]; F56 = mult[7] * table[7]; - mult += 8; table = &(fftTable[input[1] << 3]); - F1 = mult[0] * table[0]; - F9 = mult[1] * table[1]; - F17 = mult[2] * table[2]; - F25 = mult[3] * table[3]; - F33 = mult[4] * table[4]; - F41 = mult[5] * table[5]; - F49 = mult[6] * table[6]; - F57 = mult[7] * table[7]; + F1 = mult[ 8] * table[0]; + F9 = mult[ 9] * table[1]; + F17 = mult[10] * table[2]; + F25 = mult[11] * table[3]; + F33 = mult[12] * table[4]; + F41 = mult[13] * table[5]; + F49 = mult[14] * table[6]; + F57 = mult[15] * table[7]; - mult += 8; table = &(fftTable[input[2] << 3]); - F2 = mult[0] * table[0]; - F10 = mult[1] * table[1]; - F18 = mult[2] * table[2]; - F26 = mult[3] * 
table[3]; - F34 = mult[4] * table[4]; - F42 = mult[5] * table[5]; - F50 = mult[6] * table[6]; - F58 = mult[7] * table[7]; + F2 = mult[16] * table[0]; + F10 = mult[17] * table[1]; + F18 = mult[18] * table[2]; + F26 = mult[19] * table[3]; + F34 = mult[20] * table[4]; + F42 = mult[21] * table[5]; + F50 = mult[22] * table[6]; + F58 = mult[23] * table[7]; - mult += 8; table = &(fftTable[input[3] << 3]); - F3 = mult[0] * table[0]; - F11 = mult[1] * table[1]; - F19 = mult[2] * table[2]; - F27 = mult[3] * table[3]; - F35 = mult[4] * table[4]; - F43 = mult[5] * table[5]; - F51 = mult[6] * table[6]; - F59 = mult[7] * table[7]; + F3 = mult[24] * table[0]; + F11 = mult[25] * table[1]; + F19 = mult[26] * table[2]; + F27 = mult[27] * table[3]; + F35 = mult[28] * table[4]; + F43 = mult[29] * table[5]; + F51 = mult[30] * table[6]; + F59 = mult[31] * table[7]; - mult += 8; table = &(fftTable[input[4] << 3]); - F4 = mult[0] * table[0]; - F12 = mult[1] * table[1]; - F20 = mult[2] * table[2]; - F28 = mult[3] * table[3]; - F36 = mult[4] * table[4]; - F44 = mult[5] * table[5]; - F52 = mult[6] * table[6]; - F60 = mult[7] * table[7]; + F4 = mult[32] * table[0]; + F12 = mult[33] * table[1]; + F20 = mult[34] * table[2]; + F28 = mult[35] * table[3]; + F36 = mult[36] * table[4]; + F44 = mult[37] * table[5]; + F52 = mult[38] * table[6]; + F60 = mult[39] * table[7]; - mult += 8; table = &(fftTable[input[5] << 3]); - F5 = mult[0] * table[0]; - F13 = mult[1] * table[1]; - F21 = mult[2] * table[2]; - F29 = mult[3] * table[3]; - F37 = mult[4] * table[4]; - F45 = mult[5] * table[5]; - F53 = mult[6] * table[6]; - F61 = mult[7] * table[7]; + F5 = mult[40] * table[0]; + F13 = mult[41] * table[1]; + F21 = mult[42] * table[2]; + F29 = mult[43] * table[3]; + F37 = mult[44] * table[4]; + F45 = mult[45] * table[5]; + F53 = mult[46] * table[6]; + F61 = mult[47] * table[7]; - mult += 8; table = &(fftTable[input[6] << 3]); - F6 = mult[0] * table[0]; - F14 = mult[1] * table[1]; - F22 = mult[2] * table[2]; - F30 = mult[3] * table[3]; - F38 = mult[4] * table[4]; - F46 = mult[5] * table[5]; - F54 = mult[6] * table[6]; - F62 = mult[7] * table[7]; + F6 = mult[48] * table[0]; + F14 = mult[49] * table[1]; + F22 = mult[50] * table[2]; + F30 = mult[51] * table[3]; + F38 = mult[52] * table[4]; + F46 = mult[53] * table[5]; + F54 = mult[54] * table[6]; + F62 = mult[55] * table[7]; - mult += 8; table = &(fftTable[input[7] << 3]); - F7 = mult[0] * table[0]; - F15 = mult[1] * table[1]; - F23 = mult[2] * table[2]; - F31 = mult[3] * table[3]; - F39 = mult[4] * table[4]; - F47 = mult[5] * table[5]; - F55 = mult[6] * table[6]; - F63 = mult[7] * table[7]; - + F7 = mult[56] * table[0]; + F15 = mult[57] * table[1]; + F23 = mult[58] * table[2]; + F31 = mult[59] * table[3]; + F39 = mult[60] * table[4]; + F47 = mult[61] * table[5]; + F55 = mult[62] * table[6]; + F63 = mult[63] * table[7]; + + #define ADD_SUB( a, b ) \ + { \ + int temp = b; \ + b = a - b; \ + a = a + temp; \ + } + + #define Q_REDUCE( a ) \ + ( ( (a) & 0xff ) - ( (a) >> 8 ) ) + /* for ( int i = 0; i < 8; i++ ) @@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) } */ - // Second loop unrolling: // Iteration 0: ADD_SUB(F0, F1); @@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) output[47] = Q_REDUCE(F61); output[55] = Q_REDUCE(F62); output[63] = Q_REDUCE(F63); + + #undef ADD_SUB + #undef Q_REDUCE + +#endif // AVX2 elif SSE4.1 else } // Calculates the FFT part of SWIFFT. 
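The AVX2, SSE4.1 and scalar FFT paths above all reduce intermediate values with the same Q_REDUCE pattern, (a & 0xff) - (a >> 8). Since 256 is congruent to -1 mod 257, writing a = hi*256 + lo gives a congruent to lo - hi mod 257, so the reduction stays in the right residue class without any division. A small scalar check of that identity; the helper name and test range are illustrative and not from the patch, and it relies on arithmetic right shift of negative values as GCC provides.

#include <stdio.h>
#include <stdint.h>

// Scalar model of the Q_REDUCE macro used in the FFT above.
static int32_t q_reduce( int32_t a ) { return ( a & 0xff ) - ( a >> 8 ); }

int main()
{
   for ( int32_t a = -100000; a <= 100000; a++ )
   {
      // a - q_reduce(a) == 257 * (a >> 8), so it must be a multiple of 257.
      if ( ( a - q_reduce( a ) ) % 257 )
      {
         printf( "not congruent at %d\n", a );
         return 1;
      }
   }
   printf( "q_reduce(a) is congruent to a mod 257 over the test range\n" );
   return 0;
}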
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) // - m: the input size divided by 64. // - output: will store the result. // - a: the coefficients in the sum. Of size 64 * m. -void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) +void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, + const swift_int16_t *a ) { int i, j; - swift_int32_t result[N]; + swift_int32_t result[N] __attribute__ ((aligned (64))); register swift_int16_t carry = 0; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + __m512i *res = (__m512i*)result; + for ( j = 0; j < N/16; ++j ) + { + __m512i sum = _mm512_setzero_si512(); + const __m512i *f = (__m512i*)input + j; + const __m512i *k = (__m512i*)a + j; + for ( i = 0; i < m; i++, f += N/16, k += N/16 ) + sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__AVX2__) + + __m256i *res = (__m256i*)result; + for ( j = 0; j < N/8; ++j ) + { + __m256i sum = _mm256_setzero_si256(); + const __m256i *f = (__m256i*)input + j; + const __m256i *k = (__m256i*)a + j; + for ( i = 0; i < m; i++, f += N/8, k += N/8 ) + sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__SSE4_1__) + + __m128i *res = (__m128i*)result; + for ( j = 0; j < N/4; ++j ) + { + __m128i sum = _mm_setzero_si128(); + const __m128i *f = (__m128i*)input + j; + const __m128i *k = (__m128i*)a + j; + for ( i = 0; i < m; i++, f += N/4, k += N/4 ) + sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#else + for (j = 0; j < N; ++j) { register swift_int32_t sum = 0; const register swift_int32_t *f = input + j; const register swift_int16_t *k = a + j; - for (i = 0; i < m; i++, f += N,k += N) sum += (*f) * (*k); - result[j] = sum; } +#endif + for (j = 0; j < N; ++j) result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; @@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets @@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets diff --git a/configure b/configure index db3efc9f..ae0d7bec 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.0' -PACKAGE_STRING='cpuminer-opt 3.18.0' +PACKAGE_VERSION='3.18.1' +PACKAGE_STRING='cpuminer-opt 3.18.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.0 +cpuminer-opt configure 3.18.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.0, which was +It was created by cpuminer-opt $as_me 3.18.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.0' + VERSION='3.18.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.0, which was +This file was extended by cpuminer-opt $as_me 3.18.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.0 +cpuminer-opt config.status 3.18.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index fbe5a9b0..869b3669 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.18.0]) +AC_INIT([cpuminer-opt], [3.18.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index c8895381..2a63729e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2083,7 +2083,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) / ( opt_target_factor * opt_diff_factor ); diff_to_hash( g_work->target, g_work->targetdiff ); - // Increment extranonce2 + // Pre increment extranonce2 in case of being called again before receiving + // a new job for ( int t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); @@ -2103,20 +2104,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); - if ( !opt_quiet ) - { - int mismatch = submitted_share_count - - ( accepted_share_count + stale_share_count + rejected_share_count ); - if ( mismatch ) - applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); - } - if ( stratum_diff != sctx->job.diff ) applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, sctx->block_height, g_work->job_id ); else if ( last_block_height != sctx->block_height ) - applog( LOG_BLUE, "New Block %d, Job %s", - sctx->block_height, g_work->job_id ); + applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s", + sctx->block_height, net_diff, g_work->job_id ); else if ( g_work->job_id && new_job ) applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s", sctx->block_height, net_diff, g_work->job_id ); @@ -2173,7 +2166,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) { double net_hr = nd / net_ttf; char net_hr_units[4] = {0}; - scale_hash_for_display ( &net_hr, net_hr_units ); applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s", net_hr, net_hr_units ); @@ -2182,6 +2174,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) } // hr > 0 } // !quiet } // new diff/block + + if ( new_job && !opt_quiet ) + { + int mismatch = submitted_share_count - ( accepted_share_count + + stale_share_count + + rejected_share_count ); + if ( mismatch ) + applog( LOG_INFO, + CL_LBL "%d Submitted share pending, maybe stale" CL_N, + submitted_share_count ); + } } static void *miner_thread( void *userdata ) @@ -3970,6 +3973,7 @@ int main(int argc, char *argv[]) gettimeofday( &last_submit_time, NULL ); memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) ); pthread_mutex_unlock( &stats_lock ); applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 3d840107..1116976f 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -1,7 +1,7 @@ #if !defined(SIMD_256_H__) #define SIMD_256_H__ 1 -#if defined(__AVX2__) +//#if defined(__AVX2__) 
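The stratum_gen_work() change shown earlier pre-increments extranonce2 with a single for loop: each byte is incremented, and the loop only advances to the next byte when ++ wraps to zero, i.e. it ripples a carry through a little-endian multi-byte counter. A standalone sketch of that idiom; the buffer name and size here are hypothetical, not taken from the patch.

#include <stdio.h>
#include <stdint.h>

int main()
{
   // Little-endian multi-byte counter, as xnonce2 is treated in the loop above.
   uint8_t xnonce2[4] = { 0xff, 0xff, 0x00, 0x00 };
   const int xnonce2_size = 4;

   // ++xnonce2[t] wraps to 0 on overflow; only then does the carry move on.
   for ( int t = 0; t < xnonce2_size && !( ++xnonce2[t] ); t++ );

   // 0x0000ffff + 1 -> bytes 00 00 01 00 (little endian).
   printf( "%02x %02x %02x %02x\n",
           xnonce2[0], xnonce2[1], xnonce2[2], xnonce2[3] );
   return 0;
}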
///////////////////////////////////////////////////////////////////// // @@ -14,7 +14,9 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. -// Used instead if casting. +#if defined(__AVX__) + +// Used instead of casting. typedef union { __m256i m256; @@ -23,6 +25,28 @@ typedef union uint32_t u32[8]; } __attribute__ ((aligned (32))) m256_ovly; +// +// Pointer casting + +// p = any aligned pointer +// returns p as pointer to vector type, not very useful +#define castp_m256i(p) ((__m256i*)(p)) + +// p = any aligned pointer +// returns *p, watch your pointer arithmetic +#define cast_m256i(p) (*((__m256i*)(p))) + +// p = any aligned pointer, i = scaled array index +// returns value p[i] +#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) + +// p = any aligned pointer, o = scaled offset +// returns pointer p+o +#define casto_m256i(p,o) (((__m256i*)(p))+(o)) + +#endif +#if defined(__AVX2__) + // Move integer to low element of vector, other elements are set to zero. #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) @@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn() #define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v ) #define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 ) -// -// Pointer casting - -// p = any aligned pointer -// returns p as pointer to vector type, not very useful -#define castp_m256i(p) ((__m256i*)(p)) - -// p = any aligned pointer -// returns *p, watch your pointer arithmetic -#define cast_m256i(p) (*((__m256i*)(p))) - -// p = any aligned pointer, i = scaled array index -// returns value p[i] -#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) - -// p = any aligned pointer, o = scaled offset -// returns pointer p+o -#define casto_m256i(p,o) (((__m256i*)(p))+(o)) - - // // Memory functions // n = number of 256 bit (32 byte) vectors diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index de948cc4..3cc090a4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) // Rotate 256 bit lanes by one 64 bit element #define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) - #define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element @@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) // shufl2r is 2 input ... // Drop macros? They can easilly be rebuilt using shufl2 functions -// add shuflr shufll functions performing rotate, returning first arg -// They're faster than doing both, when both not needed. - // Shuffle concatenated { v1, v2 ) right or left by 256 bits and return // rotated v1 // visually confusing for shif2r because of arg order. First arg is always
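The pointer-casting macros moved under the __AVX__ guard above (castp_m256i, cast_m256i, casti_m256i, casto_m256i) simply index ordinary aligned memory as an array of __m256i. A short usage sketch under that assumption; the buffer and values are illustrative, and the two macros used are repeated locally so the snippet stands alone (compile with -mavx).

#include <immintrin.h>
#include <stdint.h>

// Same definitions as in simd-256.h above, repeated so this compiles standalone.
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
#define casto_m256i(p,o) (((__m256i*)(p))+(o))

int main()
{
   // 64 bytes = two 256-bit vectors, aligned for vector access.
   uint32_t buf[16] __attribute__ ((aligned (32))) = {0};

   // casti_m256i indexes the buffer by vector: write vectors 0 and 1.
   casti_m256i( buf, 0 ) = _mm256_set1_epi32( 1 );
   casti_m256i( buf, 1 ) = _mm256_set1_epi32( 2 );

   // casto_m256i yields a pointer to the o'th vector instead of a value.
   __m256i *second = casto_m256i( buf, 1 );

   // buf[0..7] now hold 1 and buf[8..15] hold 2.
   return ( buf[0] == 1 && buf[8] == 2 && second == (__m256i*)buf + 1 ) ? 0 : 1;
}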