From 58030e27886ff41612fe7676ea0ac75e815bbde9 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Mon, 1 Aug 2022 20:21:05 -0400 Subject: [PATCH] v3.20.2 --- Makefile.am | 2 - RELEASE_NOTES | 5 + algo-gate-api.c | 12 +- algo/blake/blake256-hash-4way.c | 79 +- algo/blake/blake2b-hash-4way.c | 6 +- algo/blake/blake2s-hash-4way.c | 8 +- algo/blake/blake512-hash-4way.c | 58 +- algo/blake/sph_blake2b.c | 55 +- algo/heavy/sph_hefty1.c | 382 -------- algo/heavy/sph_hefty1.h | 66 -- algo/lyra2/sponge.h | 12 +- algo/radiogatun/sph_radiogatun.c | 1003 --------------------- algo/radiogatun/sph_radiogatun.h | 186 ---- algo/x20/x20r-gate.c | 34 - algo/x20/x20r-gate.h | 58 -- algo/x20/x20r.c | 252 ------ algo/yescrypt/yescrypt-best.c | 5 - algo/yescrypt/yescrypt-platform.h | 213 ----- algo/yescrypt/yescrypt-simd.c | 1392 ----------------------------- algo/yescrypt/yescrypt.c | 488 ---------- algo/yescrypt/yescrypt.h | 382 -------- algo/yespower/yespower-gate.c | 8 +- configure | 20 +- configure.ac | 2 +- simd-utils/simd-128.h | 108 ++- simd-utils/simd-256.h | 132 +-- simd-utils/simd-512.h | 76 +- 27 files changed, 311 insertions(+), 4733 deletions(-) delete mode 100644 algo/heavy/sph_hefty1.c delete mode 100644 algo/heavy/sph_hefty1.h delete mode 100644 algo/radiogatun/sph_radiogatun.c delete mode 100644 algo/radiogatun/sph_radiogatun.h delete mode 100644 algo/x20/x20r-gate.c delete mode 100644 algo/x20/x20r-gate.h delete mode 100644 algo/x20/x20r.c delete mode 100644 algo/yescrypt/yescrypt-best.c delete mode 100644 algo/yescrypt/yescrypt-platform.h delete mode 100644 algo/yescrypt/yescrypt-simd.c delete mode 100644 algo/yescrypt/yescrypt.c delete mode 100644 algo/yescrypt/yescrypt.h diff --git a/Makefile.am b/Makefile.am index 82eeb6f6..b88b1e19 100644 --- a/Makefile.am +++ b/Makefile.am @@ -285,8 +285,6 @@ cpuminer_SOURCES = \ algo/x22/x22i-gate.c \ algo/x22/x25x.c \ algo/x22/x25x-4way.c \ - algo/yescrypt/yescrypt.c \ - algo/yescrypt/yescrypt-best.c \ algo/yespower/yespower-gate.c \ algo/yespower/yespower-blake2b.c \ algo/yespower/crypto/hmac-blake2b.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5e3c78b2..1f184898 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,11 @@ If not what makes it happen or not happen? Change Log ---------- +v3.20.2 + +Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2. +Removed old unused yescrypt library and other unused code. + v3.20.1 sph_blake2b optimized 1-way SSSE3 & AVX2. 
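The idea behind these bit rotation optimizations: when the rotation count is a multiple of 8 (or 16, or 32) bits, the rotate is a pure byte or element permutation, so a single shuffle instruction can replace the usual shift/shift/OR emulation. Below is a minimal sketch of the pattern using raw AVX2 intrinsics; the mm256_swap64_32 / mm256_shuflr64_24 / mm256_shuflr64_16 names in the hunks that follow are this tree's wrapper macros, and the standalone helpers here are illustrative stand-ins, not the repository's definitions.

#include <immintrin.h>

/* Generic rotate right of each 64-bit lane by c bits:
 * two shifts plus an OR, three instructions. */
static inline __m256i ror64_generic( const __m256i x, const int c )
{
    return _mm256_or_si256( _mm256_srli_epi64( x, c ),
                            _mm256_slli_epi64( x, 64 - c ) );
}

/* Rotate right by 32 == swap the 32-bit halves of each 64-bit lane:
 * a single 32-bit element shuffle. */
static inline __m256i swap64_32( const __m256i x )
{
    return _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

/* Rotate right by 24 == rotate each 64-bit lane right by 3 bytes:
 * a single byte shuffle (vpshufb). Output byte j takes input byte
 * (j + 3) mod 8 of the same lane. */
static inline __m256i shuflr64_24( const __m256i x )
{
    const __m256i m = _mm256_set_epi8( 10, 9, 8,15,14,13,12,11,
                                        2, 1, 0, 7, 6, 5, 4, 3,
                                       10, 9, 8,15,14,13,12,11,
                                        2, 1, 0, 7, 6, 5, 4, 3 );
    return _mm256_shuffle_epi8( x, m );
}

The same trick covers 32-bit lanes (rotate by 16 is a 16-bit element swap, rotate by 8 a byte shuffle) and the 128-bit mm128_* forms, where vpshufb requires SSSE3. One shuffle in place of the three-instruction chain inside the hot Blake G functions is presumably where the measured gain comes from.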
diff --git a/algo-gate-api.c b/algo-gate-api.c index f34f5ac0..dcbad060 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -371,15 +371,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_X22I: rc = register_x22i_algo ( gate ); break; case ALGO_X25X: rc = register_x25x_algo ( gate ); break; case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break; - case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break; -// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; - case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break; -// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break; + case ALGO_YESCRYPT: rc = register_yescrypt_algo ( gate ); break; + case ALGO_YESCRYPTR8: rc = register_yescryptr8_algo ( gate ); break; case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break; - case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break; -// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; - case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break; -// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break; + case ALGO_YESCRYPTR16: rc = register_yescryptr16_algo ( gate ); break; + case ALGO_YESCRYPTR32: rc = register_yescryptr32_algo ( gate ); break; case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break; case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break; case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break; diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index c55e0133..d3260677 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -400,18 +400,18 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, // Blake-256 4 way #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ +{ \ a = _mm_add_epi32( _mm_add_epi32( a, b ), \ _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ + d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ a = _mm_add_epi32( _mm_add_epi32( a, b ), \ _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ + d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ -} while (0) +} #if SPH_COMPACT_BLAKE_32 @@ -441,7 +441,8 @@ do { \ #else -#define ROUND_S_4WAY(r) do { \ +#define ROUND_S_4WAY(r) \ +{ \ GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ @@ -450,7 +451,7 @@ do { \ GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) +} #endif @@ -537,7 +538,7 @@ do { \ #if defined(__SSSE3__) -#define BLAKE256_4WAY_BLOCK_BSWAP32 do \ +#define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ); \ @@ -557,11 +558,11 @@ do { \ MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \ ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \ MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \ -} while(0) +} #else // SSE2 -#define BLAKE256_4WAY_BLOCK_BSWAP32 do \ +#define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ M0 = mm128_bswap_32( buf[0] ); \ M1 = 
mm128_bswap_32( buf[1] ); \ @@ -579,12 +580,12 @@ do { \ MD = mm128_bswap_32( buf[13] ); \ ME = mm128_bswap_32( buf[14] ); \ MF = mm128_bswap_32( buf[15] ); \ -} while(0) +} #endif // SSSE3 else SSE2 #define COMPRESS32_4WAY( rounds ) \ -do { \ +{ \ __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -631,7 +632,7 @@ do { \ H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \ H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \ H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \ -} while (0) +} #endif @@ -642,20 +643,21 @@ do { \ // Blake-256 8 way #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ +{ \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ + d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ + d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ -} while (0) +} -#define ROUND_S_8WAY(r) do { \ +#define ROUND_S_8WAY(r) \ +{ \ GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ @@ -664,7 +666,7 @@ do { \ GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) +} #define DECL_STATE32_8WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ @@ -699,7 +701,7 @@ do { \ } while (0) #define COMPRESS32_8WAY( rounds ) \ -do { \ +{ \ __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -764,10 +766,10 @@ do { \ H5 = mm256_xor3( VD, V5, H5 ); \ H6 = mm256_xor3( VE, V6, H6 ); \ H7 = mm256_xor3( VF, V7, H7 ); \ -} while (0) +} #define COMPRESS32_8WAY_LE( rounds ) \ -do { \ +{ \ __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -829,7 +831,7 @@ do { \ H5 = mm256_xor3( VD, V5, H5 ); \ H6 = mm256_xor3( VE, V6, H6 ); \ H7 = mm256_xor3( VF, V7, H7 ); \ -} while (0) +} void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, const void *data ) @@ -861,7 +863,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, // G1 V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ), _mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) ); - V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 ); + V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) ); V[ 9] = _mm256_add_epi32( V[ 9], V[13] ); V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 ); V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] ); @@ -881,7 +883,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, // G7 V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ), _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) ); - V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 ); + V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) ); V[ 3] = _mm256_add_epi32( V[ 3], _mm256_xor_si256( 
_mm256_set1_epi32( CSE ), M[15] ) ); } @@ -935,18 +937,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, // G1 V1 = _mm256_add_epi32( V1, _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) ); - VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 ); + VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) ); V9 = _mm256_add_epi32( V9, VD ); V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 ); // G4 V0 = _mm256_add_epi32( V0, V5 ); - VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 ); + VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi32( VA, VF ); V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 ); V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5, _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) ); - VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 ); + VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi32( VA, VF ); V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 ); @@ -954,12 +956,12 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); // G6 - VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 ); + VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) ); V8 = _mm256_add_epi32( V8, VD ); V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 ); V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ), _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) ); - VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 ); + VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) ); V8 = _mm256_add_epi32( V8, VD ); V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 ); @@ -967,7 +969,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, V9 = _mm256_add_epi32( V9, VE ); V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 ); V3 = _mm256_add_epi32( V3, V4 ); - VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 ); + VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) ); V9 = _mm256_add_epi32( V9, VE ); V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 ); @@ -1009,7 +1011,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, // Blake-256 16 way AVX512 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ +{ \ a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \ _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \ d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \ @@ -1020,9 +1022,10 @@ do { \ d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \ c = _mm512_add_epi32( c, d ); \ b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \ -} while (0) +} -#define ROUND_S_16WAY(r) do { \ +#define ROUND_S_16WAY(r) \ +{ \ GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ @@ -1031,7 +1034,7 @@ do { \ GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) +} #define DECL_STATE32_16WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ @@ -1066,7 +1069,7 @@ do { \ } while (0) #define COMPRESS32_16WAY( rounds ) \ -do { \ +{ \ __m512i M0, M1, M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ __m512i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -1133,10 +1136,10 @@ do { \ H5 = mm512_xor3( VD, V5, H5 ); \ H6 = mm512_xor3( VE, V6, H6 ); \ H7 = mm512_xor3( VF, V7, H7 ); \ -} while (0) +} #define COMPRESS32_16WAY_LE( rounds ) \ -do { \ +{ \ __m512i M0, M1, 
M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ __m512i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -1198,7 +1201,7 @@ do { \ H5 = mm512_xor3( VD, V5, H5 ); \ H6 = mm512_xor3( VE, V6, H6 ); \ H7 = mm512_xor3( VF, V7, H7 ); \ -} while (0) +} // Blake-256 prehash of the second block is split onto 2 parts. The first part // is constant for every nonce and only needs to be run once per job. The diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index d04601f3..6437c7ba 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -388,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ) #define B2B_G(a, b, c, d, x, y) \ { \ v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \ - v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \ + v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \ v[c] = _mm256_add_epi64( v[c], v[d] ); \ - v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \ + v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \ v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \ - v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \ + v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \ v[c] = _mm256_add_epi64( v[c], v[d] ); \ v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \ } diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index 190ad0b7..a69e5010 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -108,11 +108,11 @@ do { \ uint8_t s0 = sigma0; \ uint8_t s1 = sigma1; \ a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ + d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ + d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ } while(0) @@ -320,11 +320,11 @@ do { \ uint8_t s0 = sigma0; \ uint8_t s1 = sigma1; \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ + d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ + d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ } while(0) diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index 5c947e18..ebd5d958 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -314,10 +314,11 @@ static const sph_u64 CB[16] = { // Blake-512 8 way AVX512 -#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \ +#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \ +{ \ a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \ _mm512_set1_epi64( c1 ), m0 ), b ), a ); \ - d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \ + d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \ c = _mm512_add_epi64( c, d ); \ b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \ a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \ @@ -325,9 +326,10 @@ static const sph_u64 CB[16] = { d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \ c = _mm512_add_epi64( c, d ); \ b = 
mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \ -} while (0) +} -#define ROUND_B_8WAY(r) do { \ +#define ROUND_B_8WAY( r ) \ +{ \ GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ @@ -336,13 +338,13 @@ static const sph_u64 CB[16] = { GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) +} #define DECL_STATE64_8WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ uint64_t T0, T1; -#define COMPRESS64_8WAY( buf ) do \ +#define COMPRESS64_8WAY( buf ) \ { \ __m512i M0, M1, M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -409,7 +411,7 @@ static const sph_u64 CB[16] = { H5 = mm512_xor3( VD, V5, H5 ); \ H6 = mm512_xor3( VE, V6, H6 ); \ H7 = mm512_xor3( VF, V7, H7 ); \ -} while (0) +} void blake512_8way_compress( blake_8way_big_context *sc ) { @@ -610,7 +612,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate, V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( _mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); - VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 ); + VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) ); VA = _mm512_add_epi64( VA, VF ); V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 ); V0 = _mm512_add_epi64( V0, V5 ); @@ -714,7 +716,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, // V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 ); V1 = _mm512_add_epi64( V1, V5 ); - VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 ); + VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) ); V9 = _mm512_add_epi64( V9, VD ); V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 ); V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512( @@ -728,7 +730,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, // V2 = _mm512_add_epi64( V2, V6 ); V2 = _mm512_add_epi64( V2, _mm512_xor_si512( _mm512_set1_epi64( CBF ), M9 ) ); - VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 ); + VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) ); VA = _mm512_add_epi64( VA, VE ); V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 ); V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512( @@ -742,7 +744,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, // V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( // _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) ); - VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 ); + VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) ); VB = _mm512_add_epi64( VB, VF ); V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 ); V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( @@ -1054,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst) // Blake-512 4 way -#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \ +#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \ +{ \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ + d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); 
\ + d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \ -} while (0) +} -#define ROUND_B_4WAY(r) do { \ +#define ROUND_B_4WAY(r) \ +{ \ GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ @@ -1076,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst) GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) +} #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ uint64_t T0, T1; -#define COMPRESS64_4WAY do \ +#define COMPRESS64_4WAY \ { \ __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -1147,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst) H5 = mm256_xor3( VD, V5, H5 ); \ H6 = mm256_xor3( VE, V6, H6 ); \ H7 = mm256_xor3( VF, V7, H7 ); \ -} while (0) +} void blake512_4way_compress( blake_4way_big_context *sc ) @@ -1277,7 +1281,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate, // G4 skip nonce V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 ); - VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 ); + VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi64( VA, VF ); V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 ); V0 = _mm256_add_epi64( V0, V5 ); @@ -1364,7 +1368,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash, // finish round 0, with the nonce now available V0 = _mm256_add_epi64( V0, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ), M9 ) ); - VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 ); + VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi64( VA, VF ); V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 ); @@ -1374,34 +1378,34 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash, // G1 V1 = _mm256_add_epi64( V1, V5 ); - VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 ); + VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) ); V9 = _mm256_add_epi64( V9, VD ); V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 ); V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256( _mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) ); - VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 ); + VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) ); V9 = _mm256_add_epi64( V9, VD ); V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 ); // G2 V2 = _mm256_add_epi64( V2, _mm256_xor_si256( _mm256_set1_epi64x( CBF ), M9 ) ); - VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 ); + VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) ); VA = _mm256_add_epi64( VA, VE ); V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 ); V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256( _mm256_set1_epi64x( CB9 ), MF ), V6 ) ); - VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 ); + VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) ); VA = _mm256_add_epi64( VA, VE ); V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 ); // G3 - VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 ); + VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) ); VB = _mm256_add_epi64( VB, VF ); V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 ); V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256( 
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) ); - VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 ); + VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) ); VB = _mm256_add_epi64( VB, VF ); V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 ); diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 9e13fe4f..159d6076 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -35,7 +35,6 @@ #include "sph_blake2b.h" // Little-endian byte access. - #define B2B_GET64(p) \ (((uint64_t) ((uint8_t *) (p))[0]) ^ \ (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ @@ -46,30 +45,34 @@ (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ (((uint64_t) ((uint8_t *) (p))[7]) << 56)) -// G Mixing function. - #if defined(__AVX2__) -#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \ +#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \ { \ V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \ - _mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \ - m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \ - V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \ + _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \ + m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ + V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \ + V[2] = _mm256_add_epi64( V[2], V[3] ); \ + V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \ +\ + V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \ + _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \ + m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ + V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \ V[2] = _mm256_add_epi64( V[2], V[3] ); \ - V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \ + V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \ } #define BLAKE2B_ROUND( R ) \ { \ __m256i *V = (__m256i*)v; \ - BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \ - BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \ + const uint8_t *sigmaR = sigma[R]; \ + BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \ V[3] = mm256_shufll_64( V[3] ); \ V[2] = mm256_swap_128( V[2] ); \ V[1] = mm256_shuflr_64( V[1] ); \ - BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \ - BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \ + BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \ V[3] = mm256_shuflr_64( V[3] ); \ V[2] = mm256_swap_128( V[2] ); \ V[1] = mm256_shufll_64( V[1] ); \ @@ -77,31 +80,34 @@ #elif defined(__SSSE3__) -#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \ +#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \ { \ Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ - _mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \ - Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \ + _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ + Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \ + Vc = _mm_add_epi64( Vc, Vd ); \ + Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \ +\ + Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ + _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ + Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \ Vc = _mm_add_epi64( Vc, Vd ); \ - Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \ + Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \ } #define BLAKE2B_ROUND( R ) \ { \ __m128i *V = (__m128i*)v; \ __m128i V2, V3, V6, V7; \ - BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \ - BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \ - BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \ - BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \ + const uint8_t *sigmaR = sigma[R]; \ + BLAKE2B_G( V[0], V[2], V[4], 
V[6], 0, 1, 2, 3 ); \ + BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ V2 = mm128_shufl2r_64( V[2], V[3] ); \ V3 = mm128_shufl2r_64( V[3], V[2] ); \ V6 = mm128_shufl2l_64( V[6], V[7] ); \ V7 = mm128_shufl2l_64( V[7], V[6] ); \ - BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \ - BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \ - BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \ - BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \ + BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ + BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ V[2] = mm128_shufl2l_64( V2, V3 ); \ V[3] = mm128_shufl2l_64( V3, V2 ); \ V[6] = mm128_shufl2r_64( V6, V7 ); \ @@ -120,6 +126,7 @@ Vd = ROTR64( Vd ^ Va, 32 ); \ Vc = Vc + Vd; \ Vb = ROTR64( Vb ^ Vc, 24 ); \ +\ Va = Va + Vb + m[ sigma[R][Sb] ]; \ Vd = ROTR64( Vd ^ Va, 16 ); \ Vc = Vc + Vd; \ diff --git a/algo/heavy/sph_hefty1.c b/algo/heavy/sph_hefty1.c deleted file mode 100644 index 8a8203cf..00000000 --- a/algo/heavy/sph_hefty1.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * HEFTY1 cryptographic hash function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. - */ - -#include <assert.h> -#include <string.h> - -#ifdef _MSC_VER -#define inline __inline -#endif - -#include "sph_hefty1.h" - -#define Min(A, B) (A <= B ? A : B) -#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ - { \ - /* To thwart parallelism, Br modifies itself each time it's \ - * called. This also means that calling it in different \ - * orders yields different results. In C the order of \ - * evaluation of function arguments and + operands is \ - * unspecified (and depends on the compiler), so we must make \ - * the order of Br calls explicit. 
\ - */ \ - uint32_t brG = Br(ctx, G); \ - uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ - uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ - uint32_t brC = Br(ctx, C); \ - uint32_t brB = Br(ctx, B); \ - uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ - uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ - H = G; \ - G = F; \ - F = E; \ - E = D + Br(ctx, tmp2); \ - D = C; \ - C = B; \ - B = A; \ - A = tmp2 + tmp4; \ - } \ - -/* Nothing up my sleeve constants */ -const static uint32_t K[64] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -/* Initial hash values */ -const static uint32_t H[HEFTY1_STATE_WORDS] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL -}; - -static inline uint32_t Rr(uint32_t X, uint8_t n) -{ - return (X >> n) | (X << (32 - n)); -} - -static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) -{ - return (E & F) ^ (~E & G); -} - -static inline uint32_t Sigma1(uint32_t E) -{ - return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); -} - -static inline uint32_t sigma1(uint32_t X) -{ - return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10); -} - -static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) -{ - return (A & B) ^ (A & C) ^ (B & C); -} - -static inline uint32_t Sigma0(uint32_t A) -{ - return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); -} - -static inline uint32_t sigma0(uint32_t X) -{ - return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); -} - -static inline uint32_t Reverse32(uint32_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; - #else - return n; - #endif -} - -static inline uint64_t Reverse64(uint64_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - uint32_t a = n >> 32; - uint32_t b = (n << 32) >> 32; - - return (uint64_t)Reverse32(b) << 32 | Reverse32(a); - #else - return n; - #endif -} - -/* Smoosh byte into nibble */ -static inline uint8_t Smoosh4(uint8_t X) -{ - return (X >> 4) ^ (X & 0xf); -} - -/* Smoosh 32-bit word into 2-bits */ -static inline uint8_t Smoosh2(uint32_t X) -{ - uint16_t w = (X >> 16) ^ (X & 0xffff); - uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); - return (n >> 2) ^ (n & 0x3); -} - -static void Mangle(uint32_t *S) -{ - uint32_t *R = S; - uint32_t *C = &S[1]; - - uint8_t r0 = Smoosh4(R[0] >> 24); - uint8_t r1 = Smoosh4(R[0] >> 16); - uint8_t r2 = Smoosh4(R[0] >> 8); - uint8_t r3 = Smoosh4(R[0] & 0xff); - - int i; - - /* Diffuse */ - uint32_t tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) { - uint8_t r = Smoosh2(tmp); - switch (r) { - case 0: - C[i] ^= Rr(R[0], i + r0); - break; - case 1: - C[i] += Rr(~R[0], i + r1); - break; - case 2: - C[i] &= Rr(~R[0], i + r2); 
- break; - case 3: - C[i] ^= Rr(R[0], i + r3); - break; - } - tmp ^= C[i]; - } - - /* Compress */ - tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) - if (i % 2) - tmp ^= C[i]; - else - tmp += C[i]; - R[0] ^= tmp; -} - -static void Absorb(uint32_t *S, uint32_t X) -{ - uint32_t *R = S; - R[0] ^= X; - Mangle(S); -} - -static uint32_t Squeeze(uint32_t *S) -{ - uint32_t Y = S[0]; - Mangle(S); - return Y; -} - -/* Branch, compress and serialize function */ -static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) -{ - uint32_t R = Squeeze(ctx->sponge); - - uint8_t r0 = R >> 8; - uint8_t r1 = R & 0xff; - - uint32_t Y = 1 << (r0 % 32); - - switch (r1 % 4) - { - case 0: - /* Do nothing */ - break; - case 1: - return X & ~Y; - case 2: - return X | Y; - case 3: - return X ^ Y; - } - - return X; -} - -static void HashBlock(HEFTY1_CTX *ctx) -{ - uint32_t A, B, C, D, E, F, G, H; - uint32_t W[HEFTY1_BLOCK_BYTES]; - - assert(ctx); - - A = ctx->h[0]; - B = ctx->h[1]; - C = ctx->h[2]; - D = ctx->h[3]; - E = ctx->h[4]; - F = ctx->h[5]; - G = ctx->h[6]; - H = ctx->h[7]; - - int t = 0; - for (; t < 16; t++) { - W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ - Absorb(ctx->sponge, W[t] ^ K[t]); - } - - for (t = 0; t < 16; t++) { - Absorb(ctx->sponge, D ^ H); - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - for (t = 16; t < 64; t++) { - Absorb(ctx->sponge, H + D); - W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - - ctx->h[0] += A; - ctx->h[1] += B; - ctx->h[2] += C; - ctx->h[3] += D; - ctx->h[4] += E; - ctx->h[5] += F; - ctx->h[6] += G; - ctx->h[7] += H; - - A = 0; - B = 0; - C = 0; - D = 0; - E = 0; - F = 0; - G = 0; - H = 0; - - memset(W, 0, sizeof(W)); -} - -/* Public interface */ - -void HEFTY1_Init(HEFTY1_CTX *ctx) -{ - assert(ctx); - - memcpy(ctx->h, H, sizeof(ctx->h)); - memset(ctx->block, 0, sizeof(ctx->block)); - ctx->written = 0; - memset(ctx->sponge, 0, sizeof(ctx->sponge)); -} - -void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) -{ - assert(ctx); - - uint64_t read = 0; - while (len) { - size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES); - size_t count = Min(len, HEFTY1_BLOCK_BYTES - end); - memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count); - len -= count; - read += count; - ctx->written += count; - if (!(ctx->written % HEFTY1_BLOCK_BYTES)) - HashBlock(ctx); - } -} - -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) -{ - assert(digest); - assert(ctx); - - /* Pad message (FIPS 180 Section 5.1.1) */ - size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES); - ctx->block[used++] = 0x80; /* Append 1 to end of message */ - if (used > HEFTY1_BLOCK_BYTES - 8) { - /* We have already written into the last 64bits, so - * we must continue into the next block. 
*/ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used); - HashBlock(ctx); - used = 0; /* Create a new block (below) */ - } - - /* All remaining bits to zero */ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used); - - /* The last 64bits encode the length (in network byte order) */ - uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; - *len = Reverse64(ctx->written*8); - - HashBlock(ctx); - - /* Convert back to network byte order */ - int i = 0; - for (; i < HEFTY1_STATE_WORDS; i++) - ctx->h[i] = Reverse32(ctx->h[i]); - - memcpy(digest, ctx->h, sizeof(ctx->h)); - memset(ctx, 0, sizeof(HEFTY1_CTX)); -} - -unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) -{ - HEFTY1_CTX ctx; - static unsigned char m[HEFTY1_DIGEST_BYTES]; - - if (!digest) - digest = m; - - HEFTY1_Init(&ctx); - HEFTY1_Update(&ctx, buf, len); - HEFTY1_Final(digest, &ctx); - - return digest; -} \ No newline at end of file diff --git a/algo/heavy/sph_hefty1.h b/algo/heavy/sph_hefty1.h deleted file mode 100644 index afcd274f..00000000 --- a/algo/heavy/sph_hefty1.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * HEFTY1 cryptographic hash function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. 
- */ - -#ifndef __HEFTY1_H__ -#define __HEFTY1_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef WIN32 -#include <sys/types.h> -#endif - -#include <stdint.h> - -#define HEFTY1_DIGEST_BYTES 32 -#define HEFTY1_BLOCK_BYTES 64 -#define HEFTY1_STATE_WORDS 8 -#define HEFTY1_SPONGE_WORDS 4 - -typedef struct HEFTY1_CTX { - uint32_t h[HEFTY1_STATE_WORDS]; - uint8_t block[HEFTY1_BLOCK_BYTES]; - uint64_t written; - uint32_t sponge[HEFTY1_SPONGE_WORDS]; -} HEFTY1_CTX; - -void HEFTY1_Init(HEFTY1_CTX *cxt); -void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len); -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt); -unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest); - -#ifdef __cplusplus -} -#endif - -#endif /* __HEFTY1_H__ */ \ No newline at end of file diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 2385640e..34df0cc0 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -97,11 +97,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ // returns void, updates all args #define G_4X64(a,b,c,d) \ a = _mm256_add_epi64( a, b ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ + d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ - b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \ + b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \ a = _mm256_add_epi64( a, b ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \ + d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 ); @@ -137,11 +137,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ // returns void, all args updated #define G_2X64(a,b,c,d) \ a = _mm_add_epi64( a, b ); \ - d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \ + d = mm128_swap64_32( _mm_xor_si128( d, a) ); \ c = _mm_add_epi64( c, d ); \ - b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \ + b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \ a = _mm_add_epi64( a, b ); \ - d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \ + d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi64( c, d ); \ b = mm128_ror_64( _mm_xor_si128( b, c ), 63 ); diff --git a/algo/radiogatun/sph_radiogatun.c b/algo/radiogatun/sph_radiogatun.c deleted file mode 100644 index 888b028f..00000000 --- a/algo/radiogatun/sph_radiogatun.c +++ /dev/null @@ -1,1003 +0,0 @@ -/* $Id: radiogatun.c 226 2010-06-16 17:28:08Z tp $ */ -/* - * RadioGatun implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include <stddef.h> -#include <string.h> - -#include "sph_radiogatun.h" - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_RADIOGATUN -#define SPH_SMALL_FOOTPRINT_RADIOGATUN 1 -#endif - -/* ======================================================================= */ -/* - * The core macros. We want to unroll 13 successive rounds so that the - * belt rotation becomes pure routing, solved at compilation time, with - * no unnecessary copying. We also wish all state variables to be - * independent local variables, so that the C compiler becomes free to - * map these on registers as it sees fit. This requires some heavy - * preprocessor trickery, including a full addition macro modulo 13. - * - * These macros are size-independent. Some macros must be defined before - * use: - * WT evaluates to the type for a word (32-bit or 64-bit) - * T truncates a value to the proper word size - * ROR(x, n) right rotation of a word x, with explicit modular - * reduction of the rotation count n by the word size - * INW(i, j) input word j (0, 1, or 2) of block i (0 to 12) - * - * For INW, the input buffer is pointed to by "buf" which has type - * "const unsigned char *". - */ - -#define MUL19(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - action(12); \ - action(13); \ - action(14); \ - action(15); \ - action(16); \ - action(17); \ - action(18); \ - } while (0) - -#define DECL19(b) b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ - b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ - b ## 12, b ## 13, b ## 14, b ## 15, b ## 16, \ - b ## 17, b ## 18 - -#define M19_T7(i) M19_T7_(i) -#define M19_T7_(i) M19_T7_ ## i -#define M19_T7_0 0 -#define M19_T7_1 7 -#define M19_T7_2 14 -#define M19_T7_3 2 -#define M19_T7_4 9 -#define M19_T7_5 16 -#define M19_T7_6 4 -#define M19_T7_7 11 -#define M19_T7_8 18 -#define M19_T7_9 6 -#define M19_T7_10 13 -#define M19_T7_11 1 -#define M19_T7_12 8 -#define M19_T7_13 15 -#define M19_T7_14 3 -#define M19_T7_15 10 -#define M19_T7_16 17 -#define M19_T7_17 5 -#define M19_T7_18 12 - -#define M19_A1(i) M19_A1_(i) -#define M19_A1_(i) M19_A1_ ## i -#define M19_A1_0 1 -#define M19_A1_1 2 -#define M19_A1_2 3 -#define M19_A1_3 4 -#define M19_A1_4 5 -#define M19_A1_5 6 -#define M19_A1_6 7 -#define M19_A1_7 8 -#define M19_A1_8 9 -#define M19_A1_9 10 -#define M19_A1_10 11 -#define M19_A1_11 12 -#define M19_A1_12 13 -#define M19_A1_13 14 -#define M19_A1_14 15 -#define M19_A1_15 16 -#define M19_A1_16 17 -#define M19_A1_17 18 -#define M19_A1_18 0 - -#define M19_A2(i) M19_A2_(i) -#define M19_A2_(i) M19_A2_ ## i -#define M19_A2_0 2 -#define M19_A2_1 3 -#define M19_A2_2 4 -#define M19_A2_3 5 -#define M19_A2_4 6 -#define M19_A2_5 7 -#define M19_A2_6 8 -#define M19_A2_7 9 -#define M19_A2_8 10 -#define M19_A2_9 11 -#define M19_A2_10 12 -#define M19_A2_11 13 -#define M19_A2_12 14 -#define M19_A2_13 15 -#define M19_A2_14 16 -#define M19_A2_15 17 -#define M19_A2_16 18 -#define M19_A2_17 0 -#define M19_A2_18 1 - -#define M19_A4(i) M19_A4_(i) -#define M19_A4_(i) M19_A4_ ## i -#define M19_A4_0 
4 -#define M19_A4_1 5 -#define M19_A4_2 6 -#define M19_A4_3 7 -#define M19_A4_4 8 -#define M19_A4_5 9 -#define M19_A4_6 10 -#define M19_A4_7 11 -#define M19_A4_8 12 -#define M19_A4_9 13 -#define M19_A4_10 14 -#define M19_A4_11 15 -#define M19_A4_12 16 -#define M19_A4_13 17 -#define M19_A4_14 18 -#define M19_A4_15 0 -#define M19_A4_16 1 -#define M19_A4_17 2 -#define M19_A4_18 3 - -#define ACC_a(i) ACC_a_(i) -#define ACC_a_(i) a ## i -#define ACC_atmp(i) ACC_atmp_(i) -#define ACC_atmp_(i) atmp ## i - -#define MILL1(i) (atmp ## i = a ## i ^ T(ACC_a(M19_A1(i)) \ - | ~ACC_a(M19_A2(i)))) -#define MILL2(i) (a ## i = ROR(ACC_atmp(M19_T7(i)), ((i * (i + 1)) >> 1))) -#define MILL3(i) (atmp ## i = a ## i ^ ACC_a(M19_A1(i)) ^ ACC_a(M19_A4(i))) -#define MILL4(i) (a ## i = atmp ## i ^ (i == 0)) - -#define MILL do { \ - WT DECL19(atmp); \ - MUL19(MILL1); \ - MUL19(MILL2); \ - MUL19(MILL3); \ - MUL19(MILL4); \ - } while (0) - -#define DECL13(b) b ## 0 ## _0, b ## 0 ## _1, b ## 0 ## _2, \ - b ## 1 ## _0, b ## 1 ## _1, b ## 1 ## _2, \ - b ## 2 ## _0, b ## 2 ## _1, b ## 2 ## _2, \ - b ## 3 ## _0, b ## 3 ## _1, b ## 3 ## _2, \ - b ## 4 ## _0, b ## 4 ## _1, b ## 4 ## _2, \ - b ## 5 ## _0, b ## 5 ## _1, b ## 5 ## _2, \ - b ## 6 ## _0, b ## 6 ## _1, b ## 6 ## _2, \ - b ## 7 ## _0, b ## 7 ## _1, b ## 7 ## _2, \ - b ## 8 ## _0, b ## 8 ## _1, b ## 8 ## _2, \ - b ## 9 ## _0, b ## 9 ## _1, b ## 9 ## _2, \ - b ## 10 ## _0, b ## 10 ## _1, b ## 10 ## _2, \ - b ## 11 ## _0, b ## 11 ## _1, b ## 11 ## _2, \ - b ## 12 ## _0, b ## 12 ## _1, b ## 12 ## _2 - -#define M13_A(i, j) M13_A_(i, j) -#define M13_A_(i, j) M13_A_ ## i ## _ ## j -#define M13_A_0_0 0 -#define M13_A_0_1 1 -#define M13_A_0_2 2 -#define M13_A_0_3 3 -#define M13_A_0_4 4 -#define M13_A_0_5 5 -#define M13_A_0_6 6 -#define M13_A_0_7 7 -#define M13_A_0_8 8 -#define M13_A_0_9 9 -#define M13_A_0_10 10 -#define M13_A_0_11 11 -#define M13_A_0_12 12 -#define M13_A_1_0 1 -#define M13_A_1_1 2 -#define M13_A_1_2 3 -#define M13_A_1_3 4 -#define M13_A_1_4 5 -#define M13_A_1_5 6 -#define M13_A_1_6 7 -#define M13_A_1_7 8 -#define M13_A_1_8 9 -#define M13_A_1_9 10 -#define M13_A_1_10 11 -#define M13_A_1_11 12 -#define M13_A_1_12 0 -#define M13_A_2_0 2 -#define M13_A_2_1 3 -#define M13_A_2_2 4 -#define M13_A_2_3 5 -#define M13_A_2_4 6 -#define M13_A_2_5 7 -#define M13_A_2_6 8 -#define M13_A_2_7 9 -#define M13_A_2_8 10 -#define M13_A_2_9 11 -#define M13_A_2_10 12 -#define M13_A_2_11 0 -#define M13_A_2_12 1 -#define M13_A_3_0 3 -#define M13_A_3_1 4 -#define M13_A_3_2 5 -#define M13_A_3_3 6 -#define M13_A_3_4 7 -#define M13_A_3_5 8 -#define M13_A_3_6 9 -#define M13_A_3_7 10 -#define M13_A_3_8 11 -#define M13_A_3_9 12 -#define M13_A_3_10 0 -#define M13_A_3_11 1 -#define M13_A_3_12 2 -#define M13_A_4_0 4 -#define M13_A_4_1 5 -#define M13_A_4_2 6 -#define M13_A_4_3 7 -#define M13_A_4_4 8 -#define M13_A_4_5 9 -#define M13_A_4_6 10 -#define M13_A_4_7 11 -#define M13_A_4_8 12 -#define M13_A_4_9 0 -#define M13_A_4_10 1 -#define M13_A_4_11 2 -#define M13_A_4_12 3 -#define M13_A_5_0 5 -#define M13_A_5_1 6 -#define M13_A_5_2 7 -#define M13_A_5_3 8 -#define M13_A_5_4 9 -#define M13_A_5_5 10 -#define M13_A_5_6 11 -#define M13_A_5_7 12 -#define M13_A_5_8 0 -#define M13_A_5_9 1 -#define M13_A_5_10 2 -#define M13_A_5_11 3 -#define M13_A_5_12 4 -#define M13_A_6_0 6 -#define M13_A_6_1 7 -#define M13_A_6_2 8 -#define M13_A_6_3 9 -#define M13_A_6_4 10 -#define M13_A_6_5 11 -#define M13_A_6_6 12 -#define M13_A_6_7 0 -#define M13_A_6_8 1 -#define M13_A_6_9 2 -#define M13_A_6_10 3 -#define M13_A_6_11 
4 -#define M13_A_6_12 5 -#define M13_A_7_0 7 -#define M13_A_7_1 8 -#define M13_A_7_2 9 -#define M13_A_7_3 10 -#define M13_A_7_4 11 -#define M13_A_7_5 12 -#define M13_A_7_6 0 -#define M13_A_7_7 1 -#define M13_A_7_8 2 -#define M13_A_7_9 3 -#define M13_A_7_10 4 -#define M13_A_7_11 5 -#define M13_A_7_12 6 -#define M13_A_8_0 8 -#define M13_A_8_1 9 -#define M13_A_8_2 10 -#define M13_A_8_3 11 -#define M13_A_8_4 12 -#define M13_A_8_5 0 -#define M13_A_8_6 1 -#define M13_A_8_7 2 -#define M13_A_8_8 3 -#define M13_A_8_9 4 -#define M13_A_8_10 5 -#define M13_A_8_11 6 -#define M13_A_8_12 7 -#define M13_A_9_0 9 -#define M13_A_9_1 10 -#define M13_A_9_2 11 -#define M13_A_9_3 12 -#define M13_A_9_4 0 -#define M13_A_9_5 1 -#define M13_A_9_6 2 -#define M13_A_9_7 3 -#define M13_A_9_8 4 -#define M13_A_9_9 5 -#define M13_A_9_10 6 -#define M13_A_9_11 7 -#define M13_A_9_12 8 -#define M13_A_10_0 10 -#define M13_A_10_1 11 -#define M13_A_10_2 12 -#define M13_A_10_3 0 -#define M13_A_10_4 1 -#define M13_A_10_5 2 -#define M13_A_10_6 3 -#define M13_A_10_7 4 -#define M13_A_10_8 5 -#define M13_A_10_9 6 -#define M13_A_10_10 7 -#define M13_A_10_11 8 -#define M13_A_10_12 9 -#define M13_A_11_0 11 -#define M13_A_11_1 12 -#define M13_A_11_2 0 -#define M13_A_11_3 1 -#define M13_A_11_4 2 -#define M13_A_11_5 3 -#define M13_A_11_6 4 -#define M13_A_11_7 5 -#define M13_A_11_8 6 -#define M13_A_11_9 7 -#define M13_A_11_10 8 -#define M13_A_11_11 9 -#define M13_A_11_12 10 -#define M13_A_12_0 12 -#define M13_A_12_1 0 -#define M13_A_12_2 1 -#define M13_A_12_3 2 -#define M13_A_12_4 3 -#define M13_A_12_5 4 -#define M13_A_12_6 5 -#define M13_A_12_7 6 -#define M13_A_12_8 7 -#define M13_A_12_9 8 -#define M13_A_12_10 9 -#define M13_A_12_11 10 -#define M13_A_12_12 11 - -#define M13_N(i) M13_N_(i) -#define M13_N_(i) M13_N_ ## i -#define M13_N_0 12 -#define M13_N_1 11 -#define M13_N_2 10 -#define M13_N_3 9 -#define M13_N_4 8 -#define M13_N_5 7 -#define M13_N_6 6 -#define M13_N_7 5 -#define M13_N_8 4 -#define M13_N_9 3 -#define M13_N_10 2 -#define M13_N_11 1 -#define M13_N_12 0 - -#define ACC_b(i, k) ACC_b_(i, k) -#define ACC_b_(i, k) b ## i ## _ ## k - -#define ROUND_ELT(k, s) do { \ - if ((bj += 3) == 39) \ - bj = 0; \ - sc->b[bj + s] ^= a ## k; \ - } while (0) - -#define ROUND_SF(j) do { \ - size_t bj = (j) * 3; \ - ROUND_ELT(1, 0); \ - ROUND_ELT(2, 1); \ - ROUND_ELT(3, 2); \ - ROUND_ELT(4, 0); \ - ROUND_ELT(5, 1); \ - ROUND_ELT(6, 2); \ - ROUND_ELT(7, 0); \ - ROUND_ELT(8, 1); \ - ROUND_ELT(9, 2); \ - ROUND_ELT(10, 0); \ - ROUND_ELT(11, 1); \ - ROUND_ELT(12, 2); \ - MILL; \ - bj = (j) * 3; \ - a ## 13 ^= sc->b[bj + 0]; \ - a ## 14 ^= sc->b[bj + 1]; \ - a ## 15 ^= sc->b[bj + 2]; \ - } while (0) - -#define INPUT_SF(j, p0, p1, p2) do { \ - size_t bj = ((j) + 1) * 3; \ - if (bj == 39) \ - bj = 0; \ - sc->b[bj + 0] ^= (p0); \ - sc->b[bj + 1] ^= (p1); \ - sc->b[bj + 2] ^= (p2); \ - a16 ^= (p0); \ - a17 ^= (p1); \ - a18 ^= (p2); \ - } while (0) - - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define ROUND ROUND_SF -#define INPUT INPUT_SF - -#else - -/* - * Round function R, on base j. The value j is such that B[0] is actually - * b[j] after the initial rotation. On the 13-round macro, j has the - * successive values 12, 11, 10... 1, 0. 
- */ -#define ROUND(j) do { \ - ACC_b(M13_A(1, j), 0) ^= a ## 1; \ - ACC_b(M13_A(2, j), 1) ^= a ## 2; \ - ACC_b(M13_A(3, j), 2) ^= a ## 3; \ - ACC_b(M13_A(4, j), 0) ^= a ## 4; \ - ACC_b(M13_A(5, j), 1) ^= a ## 5; \ - ACC_b(M13_A(6, j), 2) ^= a ## 6; \ - ACC_b(M13_A(7, j), 0) ^= a ## 7; \ - ACC_b(M13_A(8, j), 1) ^= a ## 8; \ - ACC_b(M13_A(9, j), 2) ^= a ## 9; \ - ACC_b(M13_A(10, j), 0) ^= a ## 10; \ - ACC_b(M13_A(11, j), 1) ^= a ## 11; \ - ACC_b(M13_A(12, j), 2) ^= a ## 12; \ - MILL; \ - a ## 13 ^= ACC_b(j, 0); \ - a ## 14 ^= ACC_b(j, 1); \ - a ## 15 ^= ACC_b(j, 2); \ - } while (0) - -#define INPUT(j, p0, p1, p2) do { \ - ACC_b(M13_A(1, j), 0) ^= (p0); \ - ACC_b(M13_A(1, j), 1) ^= (p1); \ - ACC_b(M13_A(1, j), 2) ^= (p2); \ - a16 ^= (p0); \ - a17 ^= (p1); \ - a18 ^= (p2); \ - } while (0) - -#endif - -#define MUL13(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - action(12); \ - } while (0) - -#define MILL_READ_ELT(i) do { \ - a ## i = sc->a[i]; \ - } while (0) - -#define MILL_WRITE_ELT(i) do { \ - sc->a[i] = a ## i; \ - } while (0) - -#define STATE_READ_SF do { \ - MUL19(MILL_READ_ELT); \ - } while (0) - -#define STATE_WRITE_SF do { \ - MUL19(MILL_WRITE_ELT); \ - } while (0) - -#define PUSH13_SF do { \ - WT DECL19(a); \ - const unsigned char *buf; \ - \ - buf = data; \ - STATE_READ_SF; \ - while (len >= sizeof sc->data) { \ - size_t mk; \ - for (mk = 13; mk > 0; mk --) { \ - WT p0 = INW(0, 0); \ - WT p1 = INW(0, 1); \ - WT p2 = INW(0, 2); \ - INPUT_SF(mk - 1, p0, p1, p2); \ - ROUND_SF(mk - 1); \ - buf += (sizeof sc->data) / 13; \ - len -= (sizeof sc->data) / 13; \ - } \ - } \ - STATE_WRITE_SF; \ - return len; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define STATE_READ STATE_READ_SF -#define STATE_WRITE STATE_WRITE_SF -#define PUSH13 PUSH13_SF - -#else - -#define BELT_READ_ELT(i) do { \ - b ## i ## _0 = sc->b[3 * i + 0]; \ - b ## i ## _1 = sc->b[3 * i + 1]; \ - b ## i ## _2 = sc->b[3 * i + 2]; \ - } while (0) - -#define BELT_WRITE_ELT(i) do { \ - sc->b[3 * i + 0] = b ## i ## _0; \ - sc->b[3 * i + 1] = b ## i ## _1; \ - sc->b[3 * i + 2] = b ## i ## _2; \ - } while (0) - -#define STATE_READ do { \ - MUL13(BELT_READ_ELT); \ - MUL19(MILL_READ_ELT); \ - } while (0) - -#define STATE_WRITE do { \ - MUL13(BELT_WRITE_ELT); \ - MUL19(MILL_WRITE_ELT); \ - } while (0) - -/* - * Input data by chunks of 13*3 blocks. This is the body of the - * radiogatun32_push13() and radiogatun64_push13() functions. 
- */ -#define PUSH13 do { \ - WT DECL19(a), DECL13(b); \ - const unsigned char *buf; \ - \ - buf = data; \ - STATE_READ; \ - while (len >= sizeof sc->data) { \ - WT p0, p1, p2; \ - MUL13(PUSH13_ELT); \ - buf += sizeof sc->data; \ - len -= sizeof sc->data; \ - } \ - STATE_WRITE; \ - return len; \ - } while (0) - -#define PUSH13_ELT(k) do { \ - p0 = INW(k, 0); \ - p1 = INW(k, 1); \ - p2 = INW(k, 2); \ - INPUT(M13_N(k), p0, p1, p2); \ - ROUND(M13_N(k)); \ - } while (0) - -#endif - -#define BLANK13_SF do { \ - size_t mk = 13; \ - while (mk -- > 0) \ - ROUND_SF(mk); \ - } while (0) - -#define BLANK1_SF do { \ - WT tmp0, tmp1, tmp2; \ - ROUND_SF(12); \ - tmp0 = sc->b[36]; \ - tmp1 = sc->b[37]; \ - tmp2 = sc->b[38]; \ - memmove(sc->b + 3, sc->b, 36 * sizeof sc->b[0]); \ - sc->b[0] = tmp0; \ - sc->b[1] = tmp1; \ - sc->b[2] = tmp2; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define BLANK13 BLANK13_SF -#define BLANK1 BLANK1_SF - -#else - -/* - * Run 13 blank rounds. This macro expects the "a" and "b" state variables - * to be already declared. - */ -#define BLANK13 MUL13(BLANK13_ELT) - -#define BLANK13_ELT(k) ROUND(M13_N(k)) - -#define MUL12(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - } while (0) - -/* - * Run a single blank round, and physically rotate the belt. This is used - * for the last blank rounds, and the output rounds. This macro expects the - * "a" and "b" state variables to be already declared. - */ -#define BLANK1 do { \ - WT tmp0, tmp1, tmp2; \ - ROUND(12); \ - tmp0 = b0_0; \ - tmp1 = b0_1; \ - tmp2 = b0_2; \ - MUL12(BLANK1_ELT); \ - b1_0 = tmp0; \ - b1_1 = tmp1; \ - b1_2 = tmp2; \ - } while (0) - -#define BLANK1_ELT(i) do { \ - ACC_b(M13_A(M13_N(i), 1), 0) = ACC_b(M13_N(i), 0); \ - ACC_b(M13_A(M13_N(i), 1), 1) = ACC_b(M13_N(i), 1); \ - ACC_b(M13_A(M13_N(i), 1), 2) = ACC_b(M13_N(i), 2); \ - } while (0) - -#endif - -#define NO_TOKEN - -/* - * Perform padding, then blank rounds, then output some words. This is - * the body of sph_radiogatun32_close() and sph_radiogatun64_close(). - */ -#define CLOSE_SF(width) CLOSE_GEN(width, \ - NO_TOKEN, STATE_READ_SF, BLANK1_SF, BLANK13_SF) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN -#define CLOSE CLOSE_SF -#else -#define CLOSE(width) CLOSE_GEN(width, \ - WT DECL13(b);, STATE_READ, BLANK1, BLANK13) -#endif - -#define CLOSE_GEN(width, WTb13, state_read, blank1, blank13) do { \ - unsigned ptr, num; \ - unsigned char *out; \ - WT DECL19(a); \ - WTb13 \ - \ - ptr = sc->data_ptr; \ - sc->data[ptr ++] = 0x01; \ - memset(sc->data + ptr, 0, (sizeof sc->data) - ptr); \ - radiogatun ## width ## _push13(sc, sc->data, sizeof sc->data); \ - \ - num = 17; \ - for (;;) { \ - ptr += 3 * (width >> 3); \ - if (ptr > sizeof sc->data) \ - break; \ - num --; \ - } \ - \ - state_read; \ - if (num >= 13) { \ - blank13; \ - num -= 13; \ - } \ - while (num -- > 0) \ - blank1; \ - \ - num = 0; \ - out = dst; \ - for (;;) { \ - OUTW(out, a1); \ - out += width >> 3; \ - OUTW(out, a2); \ - out += width >> 3; \ - num += 2 * (width >> 3); \ - if (num >= 32) \ - break; \ - blank1; \ - } \ - INIT; \ - } while (0) - -/* - * Initialize context structure. 
- */ -#if SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN - -#define INIT do { \ - memset(sc->a, 0, sizeof sc->a); \ - memset(sc->b, 0, sizeof sc->b); \ - sc->data_ptr = 0; \ - } while (0) - -#else - -#define INIT do { \ - size_t u; \ - for (u = 0; u < 19; u ++) \ - sc->a[u] = 0; \ - for (u = 0; u < 39; u ++) \ - sc->b[u] = 0; \ - sc->data_ptr = 0; \ - } while (0) - -#endif - -/* ======================================================================= */ -/* - * RadioGatun[32]. - */ - -#if !SPH_NO_RG32 - -#undef WT -#define WT sph_u32 -#undef T -#define T SPH_T32 -#undef ROR -#define ROR(x, n) SPH_T32(((x) << ((32 - (n)) & 31)) | ((x) >> ((n) & 31))) -#undef INW -#define INW(i, j) sph_dec32le_aligned(buf + (4 * (3 * (i) + (j)))) -#undef OUTW -#define OUTW(b, v) sph_enc32le(b, v) - -/* - * Insert data by big chunks of 13*12 = 156 bytes. Returned value is the - * number of remaining bytes (between 0 and 155). This method assumes that - * the input data is suitably aligned. - */ -static size_t -radiogatun32_push13(sph_radiogatun32_context *sc, const void *data, size_t len) -{ - PUSH13; -} - -/* see sph_radiogatun.h */ -void -sph_radiogatun32_init(void *cc) -{ - sph_radiogatun32_context *sc; - - sc = cc; - INIT; -} - -#ifdef SPH_UPTR -static void -radiogatun32_short(void *cc, const void *data, size_t len) -#else -/* see sph_radiogatun.h */ -void -sph_radiogatun32(void *cc, const void *data, size_t len) -#endif -{ - sph_radiogatun32_context *sc; - unsigned ptr; - - sc = cc; - ptr = sc->data_ptr; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->data) - ptr; - if (clen > len) - clen = len; - memcpy(sc->data + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->data) { - radiogatun32_push13(sc, sc->data, sizeof sc->data); - ptr = 0; - } - } - sc->data_ptr = ptr; -} - -#ifdef SPH_UPTR -/* see sph_radiogatun.h */ -void -sph_radiogatun32(void *cc, const void *data, size_t len) -{ - sph_radiogatun32_context *sc; - unsigned ptr; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - radiogatun32_short(cc, data, len); - return; - } - sc = cc; - ptr = sc->data_ptr; - if (ptr > 0) { - unsigned t; - - t = (sizeof sc->data) - ptr; - radiogatun32_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 3) != 0) { - radiogatun32_short(sc, data, len); - return; - } -#endif - rlen = radiogatun32_push13(sc, data, len); - memcpy(sc->data, (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_radiogatun.h */ -void -sph_radiogatun32_close(void *cc, void *dst) -{ - sph_radiogatun32_context *sc; - - sc = cc; - CLOSE(32); -} - -#endif - -/* ======================================================================= */ -/* - * RadioGatun[64]. Compiled only if a 64-bit or more type is available. - */ - -#if SPH_64 - -#if !SPH_NO_RG64 - -#undef WT -#define WT sph_u64 -#undef T -#define T SPH_T64 -#undef ROR -#define ROR(x, n) SPH_T64(((x) << ((64 - (n)) & 63)) | ((x) >> ((n) & 63))) -#undef INW -#define INW(i, j) sph_dec64le_aligned(buf + (8 * (3 * (i) + (j)))) -#undef OUTW -#define OUTW(b, v) sph_enc64le(b, v) - -/* - * On 32-bit x86, register pressure is such that using the small - * footprint version is a net gain (x2 speed), because that variant - * uses fewer local variables. 
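Stepping back to the ROR macro defined at the top of this RadioGatun[32] section: it is a branch-free 32-bit right rotation with both shift counts masked, so a rotation by 0 cannot produce an undefined shift by 32. An equivalent standalone form, a sketch assuming only <stdint.h>:

    #include <stdint.h>

    /* Rotate x right by n bits; the & 31 masks keep n == 0 well-defined,
       matching the ROR macro above. */
    static uint32_t ror32(uint32_t x, unsigned n)
    {
        return (x << ((32 - n) & 31)) | (x >> (n & 31));
    }

For example, ror32(0x80000001, 1) == 0xC0000000.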
- */ -#if SPH_I386_MSVC || SPH_I386_GCC || defined __i386__ -#undef PUSH13 -#define PUSH13 PUSH13_SF -#undef CLOSE -#define CLOSE CLOSE_SF -#endif - -/* - * Insert data by big chunks of 13*24 = 312 bytes. Returned value is the - * number of remaining bytes (between 0 and 311). This method assumes that - * the input data is suitably aligned. - */ -static size_t -radiogatun64_push13(sph_radiogatun64_context *sc, const void *data, size_t len) -{ - PUSH13; -} - -/* see sph_radiogatun.h */ -void -sph_radiogatun64_init(void *cc) -{ - sph_radiogatun64_context *sc; - - sc = cc; - INIT; -} - -#ifdef SPH_UPTR -static void -radiogatun64_short(void *cc, const void *data, size_t len) -#else -/* see sph_radiogatun.h */ -void -sph_radiogatun64(void *cc, const void *data, size_t len) -#endif -{ - sph_radiogatun64_context *sc; - unsigned ptr; - - sc = cc; - ptr = sc->data_ptr; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->data) - ptr; - if (clen > len) - clen = len; - memcpy(sc->data + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->data) { - radiogatun64_push13(sc, sc->data, sizeof sc->data); - ptr = 0; - } - } - sc->data_ptr = ptr; -} - -#ifdef SPH_UPTR -/* see sph_radiogatun.h */ -void -sph_radiogatun64(void *cc, const void *data, size_t len) -{ - sph_radiogatun64_context *sc; - unsigned ptr; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - radiogatun64_short(cc, data, len); - return; - } - sc = cc; - ptr = sc->data_ptr; - if (ptr > 0) { - unsigned t; - - t = (sizeof sc->data) - ptr; - radiogatun64_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 7) != 0) { - radiogatun64_short(sc, data, len); - return; - } -#endif - rlen = radiogatun64_push13(sc, data, len); - memcpy(sc->data, (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_radiogatun.h */ -void -sph_radiogatun64_close(void *cc, void *dst) -{ - sph_radiogatun64_context *sc; - - sc = cc; - CLOSE(64); -} - -#endif - -#endif diff --git a/algo/radiogatun/sph_radiogatun.h b/algo/radiogatun/sph_radiogatun.h deleted file mode 100644 index 4e3888c8..00000000 --- a/algo/radiogatun/sph_radiogatun.h +++ /dev/null @@ -1,186 +0,0 @@ -/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */ -/** - * RadioGatun interface. - * - * RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters - * and G. Van Assche, "RadioGatun, a belt-and-mill hash function", - * presented at the Second Cryptographic Hash Workshop, Santa Barbara, - * August 24-25, 2006. The main Web site, containing that article, the - * reference code and some test vectors, appears to be currently located - * at the following URL: http://radiogatun.noekeon.org/ - * - * The presentation article does not specify endianness or padding. The - * reference code uses the following conventions, which we also apply - * here: - *
- *
- * The input message is an integral number of sequences of three
- * words. Each word is either a 32-bit or 64-bit word (depending on
- * the version of RadioGatun).
- *
- * Input bytes are decoded into words using little-endian
- * convention.
- *
- * Padding consists of a single bit of value 1, using little-endian
- * convention within bytes (i.e. for a byte-oriented input, a single
- * byte of value 0x01 is appended), then enough bits of value 0 to finish
- * the current block; a byte-level sketch of this rule follows the list.
- *
- * Output consists of 256 bits. Successive output words are encoded
- * with little-endian convention.
- *
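The padding rule in isolation, as a minimal sketch (not code from this tree; rg_pad_block is a hypothetical helper, and 156 bytes is the RadioGatun[32] block size used by the context below):

    #include <string.h>

    /* Append the padding bit as a 0x01 byte, then zero-fill to the end of
       the block; CLOSE_GEN in sph_radiogatun.c does exactly this inline. */
    static void rg_pad_block(unsigned char *block, size_t block_len, size_t used)
    {
        block[used++] = 0x01;
        memset(block + used, 0, block_len - used);
    }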
- * These conventions are very close to those we use for PANAMA, which is
- * a close ancestor of RadioGatun.
- *
- * RadioGatun is actually a family of functions, depending on some
- * internal parameters. We implement here two functions, with a "belt
- * length" of 13, a "belt width" of 3, and a "mill length" of 19. The
- * RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
- * variant uses 64-bit words.
- *
- * Strictly speaking, the name "RadioGatun" should use an acute accent
- * on the "u", which we omitted here to keep strict ASCII-compatibility
- * of this file.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_radiogatun.h
- * @author   Thomas Pornin
- */
-
-#ifndef SPH_RADIOGATUN_H__
-#define SPH_RADIOGATUN_H__
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-
-/**
- * Output size (in bits) for RadioGatun[32].
- */
-#define SPH_SIZE_radiogatun32   256
-
-/**
- * This structure is a context for RadioGatun[32] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[32] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[32]
- * computation can be cloned by copying the context (e.g. with a
- * simple memcpy()).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-    unsigned char data[156];   /* first field, for alignment */
-    unsigned data_ptr;
-    sph_u32 a[19], b[39];
-#endif
-} sph_radiogatun32_context;
-
-/**
- * Initialize a RadioGatun[32] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[32] context (pointer to a
- *             sph_radiogatun32_context)
- */
-void sph_radiogatun32_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that len is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[32] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun32(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[32] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accommodate the result (32 bytes).
The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[32] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun32_close(void *cc, void *dst);
-
-#if SPH_64
-
-/**
- * Output size (in bits) for RadioGatun[64].
- */
-#define SPH_SIZE_radiogatun64   256
-
-/**
- * This structure is a context for RadioGatun[64] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[64] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[64]
- * computation can be cloned by copying the context (e.g. with a
- * simple memcpy()).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-    unsigned char data[312];   /* first field, for alignment */
-    unsigned data_ptr;
-    sph_u64 a[19], b[39];
-#endif
-} sph_radiogatun64_context;
-
-/**
- * Initialize a RadioGatun[64] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[64] context (pointer to a
- *             sph_radiogatun64_context)
- */
-void sph_radiogatun64_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that len is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[64] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun64(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[64] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accommodate the result (32 bytes). The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[64] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun64_close(void *cc, void *dst);
-
-#endif
-
-#endif
diff --git a/algo/x20/x20r-gate.c b/algo/x20/x20r-gate.c
deleted file mode 100644
index 48aa85b3..00000000
--- a/algo/x20/x20r-gate.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "x20r-gate.h"
-
-void x20r_getAlgoString( const uint8_t* prevblock, char *output )
-{
-   char *sptr = output;
-
-   for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
-   {
-      char b = (19 - j) >> 1; // 16 ascii hex chars, reversed
-      uint8_t algoDigit = (j & 1) ?
prevblock[b] & 0xF : prevblock[b] >> 4;
-      if (algoDigit >= 10)
-         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
-      else
-         sprintf(sptr, "%u", (uint32_t) algoDigit);
-      sptr++;
-   }
-   *sptr = '\0';
-}
-
-bool register_x20r_algo( algo_gate_t* gate )
-{
-#if defined (X20R_4WAY)
-  gate->scanhash = (void*)&scanhash_x20r_4way;
-  gate->hash = (void*)&x20r_4way_hash;
-#else
-  gate->scanhash = (void*)&scanhash_x20r;
-  gate->hash = (void*)&x20r_hash;
-#endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
-  opt_target_factor = 256.;
-  return true;
-}
-
diff --git a/algo/x20/x20r-gate.h b/algo/x20/x20r-gate.h
deleted file mode 100644
index 07e975ad..00000000
--- a/algo/x20/x20r-gate.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef X20R_GATE_H__
-#define X20R_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-/*
-#if defined(__AVX2__) && defined(__AES__)
-  #define X20R_4WAY
-#endif
-*/
-
-enum x20r_Algo {
-   BLAKE = 0,
-   BMW,
-   GROESTL,
-   JH,
-   KECCAK,
-   SKEIN,
-   LUFFA,
-   CUBEHASH,
-   SHAVITE,
-   SIMD,
-   ECHO,
-   HAMSI,
-   FUGUE,
-   SHABAL,
-   WHIRLPOOL,
-   SHA_512,
-   HAVAL,        // 256-bit output
-   GOST,
-   RADIOGATUN,   // 256-bit output
-   PANAMA,       // 256-bit output
-   X20R_HASH_FUNC_COUNT
-};
-
-void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
-
-void x20r_getAlgoString( const uint8_t* prevblock, char *output );
-
-bool register_x20r_algo( algo_gate_t* gate );
-
-#if defined(X20R_4WAY)
-
-void x20r_4way_hash( void *state, const void *input );
-
-int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
-void x20r_hash( void *state, const void *input );
-
-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
diff --git a/algo/x20/x20r.c b/algo/x20/x20r.c
deleted file mode 100644
index 8c3d43e7..00000000
--- a/algo/x20/x20r.c
+++ /dev/null
@@ -1,252 +0,0 @@
-#include "x20r-gate.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/haval/sph-haval.h"
-#include "algo/radiogatun/sph_radiogatun.h"
-#include "algo/panama/sph_panama.h"
-#include "algo/gost/sph_gost.h"
-#include "algo/sha/sph_sha2.h"
-#if defined(__AES__)
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-#else
-  #include "algo/groestl/sph_groestl.h"
-  #include "algo/echo/sph_echo.h"
-#endif
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
-
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x20r_context_overlay
-{
-   sph_blake512_context    blake;
-   sph_bmw512_context      bmw;
-#if defined(__AES__)
-   hashState_groestl       groestl;
-   hashState_echo          echo;
-#else
-   sph_groestl512_context  groestl;
-   sph_echo512_context     echo;
-#endif
-   sph_skein512_context    skein;
-   sph_jh512_context       jh;
-   sph_keccak512_context   keccak;
-   hashState_luffa         luffa;
-   cubehashParam           cube;
-   hashState_sd            simd;
-   sph_shavite512_context  shavite;
-   sph_hamsi512_context    hamsi;
-   sph_fugue512_context    fugue;
-   sph_shabal512_context   shabal;
-   sph_whirlpool_context   whirlpool;
-
sph_sha512_context sha512; - sph_haval256_5_context haval; - sph_gost512_context gost; - sph_radiogatun64_context radiogatun; - sph_panama_context panama; -}; -typedef union _x20r_context_overlay x20r_context_overlay; - -void x20r_hash(void* output, const void* input) -{ - uint32_t _ALIGN(128) hash[64/4]; - x20r_context_overlay ctx; - void *in = (void*) input; - int size = 80; - - if ( s_ntime == UINT32_MAX ) - { - const uint8_t* in8 = (uint8_t*) input; - x20_r_s_getAlgoString(&in8[4], hashOrder); - } - - for (int i = 0; i < 20; i++) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - sph_blake512_init(&ctx.blake); - sph_blake512(&ctx.blake, in, size); - sph_blake512_close(&ctx.blake, hash); - break; - case BMW: - sph_bmw512_init(&ctx.bmw); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init(&ctx.groestl); - sph_groestl512(&ctx.groestl, in, size); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init(&ctx.skein); - sph_skein512(&ctx.skein, in, size); - sph_skein512_close(&ctx.skein, hash); - break; - case JH: - sph_jh512_init(&ctx.jh); - sph_jh512(&ctx.jh, in, size); - sph_jh512_close(&ctx.jh, hash); - break; - case KECCAK: - sph_keccak512_init(&ctx.keccak); - sph_keccak512(&ctx.keccak, in, size); - sph_keccak512_close(&ctx.keccak, hash); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, in, size); - sph_shavite512_close(&ctx.shavite, hash); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)in, size<<3 ); -#else - sph_echo512_init(&ctx.echo); - sph_echo512(&ctx.echo, in, size); - sph_echo512_close(&ctx.echo, hash); -#endif - break; - case HAMSI: - sph_hamsi512_init(&ctx.hamsi); - sph_hamsi512(&ctx.hamsi, in, size); - sph_hamsi512_close(&ctx.hamsi, hash); - break; - case FUGUE: - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, in, size); - sph_fugue512_close(&ctx.fugue, hash); - break; - case SHABAL: - sph_shabal512_init(&ctx.shabal); - sph_shabal512(&ctx.shabal, in, size); - sph_shabal512_close(&ctx.shabal, hash); - break; - case WHIRLPOOL: - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool(&ctx.whirlpool, in, size); - sph_whirlpool_close(&ctx.whirlpool, hash); - break; - case SHA_512: - sph_sha512_Init( &ctx.sha512 ); - sph_sha512( &ctx.sha512, in, size ); - sph_sha512_close( &ctx.sha512, hash ); - break; - case HAVAL: - sph_haval256_5_init(&ctx.haval); - sph_haval256_5(&ctx.haval, in, size); - sph_haval256_5_close(&ctx.haval, hash); - memset(&hash[8], 0, 32); - break; - case GOST: - sph_gost512_init(&ctx.gost); - sph_gost512(&ctx.gost, in, size); - sph_gost512_close(&ctx.gost, hash); - break; - case RADIOGATUN: - sph_radiogatun64_init(&ctx.radiogatun); - 
sph_radiogatun64(&ctx.radiogatun, in, size); - sph_radiogatun64_close(&ctx.radiogatun, hash); - memset(&hash[8], 0, 32); - break; - case PANAMA: - sph_panama_init(&ctx.panama); - sph_panama(&ctx.panama, in, size); - sph_panama_close(&ctx.panama, hash); - memset(&hash[8], 0, 32); - break; - } - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); -} - -int scanhash_x20r( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - for (int k=0; k < 19; k++) - be32enc( &endiandata[k], pdata[k] ); - - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do { - be32enc( &endiandata[19], nonce ); - x20r_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) ) - { - pdata[19] = nonce; - submit_solution( work, hash32, mythr ); - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/yescrypt/yescrypt-best.c b/algo/yescrypt/yescrypt-best.c deleted file mode 100644 index 4e836215..00000000 --- a/algo/yescrypt/yescrypt-best.c +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef __SSE2__ -#include "yescrypt-simd.c" -#else -#include "yescrypt-opt.c" -#endif diff --git a/algo/yescrypt/yescrypt-platform.h b/algo/yescrypt/yescrypt-platform.h deleted file mode 100644 index bf6df915..00000000 --- a/algo/yescrypt/yescrypt-platform.h +++ /dev/null @@ -1,213 +0,0 @@ -/*- - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */
-
-#ifdef MAP_ANON
-#include <sys/mman.h>
-#endif
-
-#include "yescrypt.h"
-#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024)
-
-#ifdef __x86_64__
-#define HUGEPAGE_SIZE (2 * 1024 * 1024)
-#else
-#undef HUGEPAGE_SIZE
-#endif
-
-/*
-static __inline uint32_t
-le32dec(const void *pp)
-{
-    const uint8_t *p = (uint8_t const *)pp;
-
-    return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
-        ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
-}
-
-static __inline void
-le32enc(void *pp, uint32_t x)
-{
-    uint8_t * p = (uint8_t *)pp;
-
-    p[0] = x & 0xff;
-    p[1] = (x >> 8) & 0xff;
-    p[2] = (x >> 16) & 0xff;
-    p[3] = (x >> 24) & 0xff;
-}
-*/
-
-static void *
-alloc_region(yescrypt_region_t * region, size_t size)
-{
-    size_t base_size = size;
-    uint8_t * base, * aligned;
-#ifdef MAP_ANON
-    int flags =
-#ifdef MAP_NOCORE
-        MAP_NOCORE |
-#endif
-        MAP_ANON | MAP_PRIVATE;
-#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
-    size_t new_size = size;
-    const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
-    if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
-        flags |= MAP_HUGETLB;
-/*
- * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of
- * huge page size, so let's round up to huge page size here.
- */
-        new_size = size + hugepage_mask;
-        new_size &= ~hugepage_mask;
-    }
-    base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
-    if (base != MAP_FAILED) {
-        base_size = new_size;
-    } else
-    if (flags & MAP_HUGETLB) {
-        flags &= ~MAP_HUGETLB;
-        base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-    }
-
-#else
-    base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-#endif
-    if (base == MAP_FAILED)
-        base = NULL;
-    aligned = base;
-#elif defined(HAVE_POSIX_MEMALIGN)
-    if ((errno = posix_memalign((void **)&base, 64, size)) != 0)
-        base = NULL;
-    aligned = base;
-#else
-    base = aligned = NULL;
-    if (size + 63 < size) {
-        errno = ENOMEM;
-    } else if ((base = malloc(size + 63)) != NULL) {
-        aligned = base + 63;
-        aligned -= (uintptr_t)aligned & 63;
-    }
-#endif
-    region->base = base;
-    region->aligned = aligned;
-    region->base_size = base ? base_size : 0;
-    region->aligned_size = base ?
size : 0; - return aligned; -} - -static __inline void -init_region(yescrypt_region_t * region) -{ - region->base = region->aligned = NULL; - region->base_size = region->aligned_size = 0; -} - -static int -free_region(yescrypt_region_t * region) -{ - if (region->base) { -#ifdef MAP_ANON - if (munmap(region->base, region->base_size)) - return -1; -#else - free(region->base); -#endif - } - init_region(region); - return 0; -} - -int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen, - uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask, - uint8_t * buf, size_t buflen) -{ - yescrypt_shared1_t* shared1 = &shared->shared1; - yescrypt_shared_t dummy, half1, half2; - uint8_t salt[32]; - - if (flags & YESCRYPT_SHARED_PREALLOCATED) { - if (!shared1->aligned || !shared1->aligned_size) - return -1; - } else { - init_region(shared1); - } - shared->mask1 = 1; - if (!param && !paramlen && !N && !r && !p && !buf && !buflen) - return 0; - - init_region(&dummy.shared1); - dummy.mask1 = 1; - if (yescrypt_kdf(&dummy, shared1, - param, paramlen, NULL, 0, N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt), 0 ) ) - goto out; - - half1 = half2 = *shared; - half1.shared1.aligned_size /= 2; - half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size); - half2.shared1.aligned_size = half1.shared1.aligned_size; - N /= 2; - - if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, - salt, sizeof(salt), 0 )) - goto out; - - if (yescrypt_kdf(&half2, &half1.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt), 0)) - goto out; - - if (yescrypt_kdf(&half1, &half2.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - buf, buflen, 0)) - goto out; - - shared->mask1 = mask; - - return 0; - -out: - if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) - free_region(shared1); - return -1; -} - -int -yescrypt_free_shared(yescrypt_shared_t * shared) -{ - return free_region(&shared->shared1); -} - -int -yescrypt_init_local(yescrypt_local_t * local) -{ - init_region(local); - return 0; -} - -int -yescrypt_free_local(yescrypt_local_t * local) -{ - return free_region(local); -} diff --git a/algo/yescrypt/yescrypt-simd.c b/algo/yescrypt/yescrypt-simd.c deleted file mode 100644 index 0cbb528d..00000000 --- a/algo/yescrypt/yescrypt-simd.c +++ /dev/null @@ -1,1392 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * This file was originally written by Colin Percival as part of the Tarsnap
- * online backup system.
- */
-
-/*
- * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding
- * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX
- * and XOP are of further help either way.
- */
-/*
-#ifndef __SSE4_1__
-#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance"
-#endif
-*/
-
-#include <emmintrin.h>
-#ifdef __XOP__
-#include <x86intrin.h>
-#endif
-
-#include <errno.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "algo/sha/hmac-sha256-hash.h"
-#include "yescrypt.h"
-#include "yescrypt-platform.h"
-
-#include "compat.h"
-
-#if __STDC_VERSION__ >= 199901L
-/* have restrict */
-#elif defined(__GNUC__)
-#define restrict __restrict
-#else
-#define restrict
-#endif
-
-#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint));
-#define PREFETCH_OUT(x, hint) /* disabled */
-
-#ifdef __XOP__
-#define ARX(out, in1, in2, s) \
-    out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s));
-#else
-#define ARX(out, in1, in2, s) \
-    { \
-        __m128i T = _mm_add_epi32(in1, in2); \
-        out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \
-        out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \
-    }
-#endif
-
-#define SALSA20_2ROUNDS \
-    /* Operate on "columns" */ \
-    ARX(X1, X0, X3, 7) \
-    ARX(X2, X1, X0, 9) \
-    ARX(X3, X2, X1, 13) \
-    ARX(X0, X3, X2, 18) \
-\
-    /* Rearrange data */ \
-    X1 = _mm_shuffle_epi32(X1, 0x93); \
-    X2 = _mm_shuffle_epi32(X2, 0x4E); \
-    X3 = _mm_shuffle_epi32(X3, 0x39); \
-\
-    /* Operate on "rows" */ \
-    ARX(X3, X0, X1, 7) \
-    ARX(X2, X3, X0, 9) \
-    ARX(X1, X2, X3, 13) \
-    ARX(X0, X1, X2, 18) \
-\
-    /* Rearrange data */ \
-    X1 = _mm_shuffle_epi32(X1, 0x39); \
-    X2 = _mm_shuffle_epi32(X2, 0x4E); \
-    X3 = _mm_shuffle_epi32(X3, 0x93);
-
-/**
- * Apply the salsa20/8 core to the block provided in (X0 ... X3).
- */
-#define SALSA20_8_BASE(maybe_decl, out) \
-    { \
-        maybe_decl Y0 = X0; \
-        maybe_decl Y1 = X1; \
-        maybe_decl Y2 = X2; \
-        maybe_decl Y3 = X3; \
-        SALSA20_2ROUNDS \
-        SALSA20_2ROUNDS \
-        SALSA20_2ROUNDS \
-        SALSA20_2ROUNDS \
-        (out)[0] = X0 = _mm_add_epi32(X0, Y0); \
-        (out)[1] = X1 = _mm_add_epi32(X1, Y1); \
-        (out)[2] = X2 = _mm_add_epi32(X2, Y2); \
-        (out)[3] = X3 = _mm_add_epi32(X3, Y3); \
-    }
-#define SALSA20_8(out) \
-    SALSA20_8_BASE(__m128i, out)
-
-/**
- * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3).
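As an aside on the ARX macro and SALSA20_2ROUNDS above: each vector ARX is four parallel copies of the scalar Salsa20 operation out ^= rotl32(in1 + in2, s). A scalar sketch of one "column" step, under that reading:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));   /* n is 7, 9, 13 or 18 here */
    }

    /* Scalar analogue of the four column-step ARX() calls in SALSA20_2ROUNDS. */
    static void salsa20_column_step(uint32_t x[4])
    {
        x[1] ^= rotl32(x[0] + x[3], 7);
        x[2] ^= rotl32(x[1] + x[0], 9);
        x[3] ^= rotl32(x[2] + x[1], 13);
        x[0] ^= rotl32(x[3] + x[2], 18);
    }

The row step is the same pattern after the lane rotations done by _mm_shuffle_epi32.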
- */
-#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \
-    X0 = _mm_xor_si128(X0, Z0); \
-    X1 = _mm_xor_si128(X1, Z1); \
-    X2 = _mm_xor_si128(X2, Z2); \
-    X3 = _mm_xor_si128(X3, Z3); \
-    SALSA20_8_BASE(maybe_decl, out)
-
-#define SALSA20_8_XOR_MEM(in, out) \
-    SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out)
-
-#define SALSA20_8_XOR_REG(out) \
-    SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out)
-
-typedef union {
-    uint32_t w[16];
-    __m128i q[4];
-} salsa20_blk_t;
-
-/**
- * blockmix_salsa8(Bin, Bout, r):
- * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size.
- */
-static inline void
-blockmix_salsa8(const salsa20_blk_t *restrict Bin,
-    salsa20_blk_t *restrict Bout, size_t r)
-{
-    __m128i X0, X1, X2, X3;
-    size_t i;
-
-    r--;
-    PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0)
-    for (i = 0; i < r; i++) {
-        PREFETCH(&Bin[i * 2], _MM_HINT_T0)
-        PREFETCH_OUT(&Bout[i], _MM_HINT_T0)
-        PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0)
-        PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0)
-    }
-    PREFETCH(&Bin[r * 2], _MM_HINT_T0)
-    PREFETCH_OUT(&Bout[r], _MM_HINT_T0)
-    PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0)
-
-    /* 1: X <-- B_{2r - 1} */
-    X0 = Bin[r * 2 + 1].q[0];
-    X1 = Bin[r * 2 + 1].q[1];
-    X2 = Bin[r * 2 + 1].q[2];
-    X3 = Bin[r * 2 + 1].q[3];
-
-    /* 3: X <-- H(X \xor B_i) */
-    /* 4: Y_i <-- X */
-    /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-    SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q)
-
-    /* 2: for i = 0 to 2r - 1 do */
-    for (i = 0; i < r;) {
-        /* 3: X <-- H(X \xor B_i) */
-        /* 4: Y_i <-- X */
-        /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-        SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q)
-
-        i++;
-
-        /* 3: X <-- H(X \xor B_i) */
-        /* 4: Y_i <-- X */
-        /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-        SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q)
-    }
-
-    /* 3: X <-- H(X \xor B_i) */
-    /* 4: Y_i <-- X */
-    /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-    SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q)
-}
-
-/*
- * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
- * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and
- * destination registers, whereas the shifts would require an extra move
- * instruction for our code when building without AVX. Unfortunately, PSHUFD
- * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ)
- * and somewhat slower on some non-Intel CPUs (luckily not including AMD
- * Bulldozer and Piledriver). Since for many other CPUs using (V)PSHUFD is a
- * win in terms of throughput and/or not needing a move instruction, we
- * currently use it despite the higher latency on some older CPUs. As an
- * alternative, the #if below may be patched to only enable use of (V)PSHUFD
- * when building with SSE4.1 or newer, which is not available on older CPUs
- * where this instruction has higher latency.
- */ -#if 1 -#define HI32(X) \ - _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) -#elif 0 -#define HI32(X) \ - _mm_srli_si128((X), 4) -#else -#define HI32(X) \ - _mm_srli_epi64((X), 32) -#endif - -#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) -/* Intel's name, also supported by recent gcc */ -#define EXTRACT64(X) _mm_cvtsi128_si64(X) -#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) -/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ -#define EXTRACT64(X) _mm_cvtsi128_si64x(X) -#elif defined(__x86_64__) && defined(__SSE4_1__) -/* No known bugs for this intrinsic */ -#include -#define EXTRACT64(X) _mm_extract_epi64((X), 0) -#elif defined(__SSE4_1__) -/* 32-bit */ -#include -#if 0 -/* This is currently unused by the code below, which instead uses these two - * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) -#endif -#else -/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) -#endif - -/* This is tunable */ -#define S_BITS 8 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define S_SIMD 2 -#define S_P 4 - -/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ -#define S_N 2 - -/* Derived values. Not tunable except via S_BITS above. */ -#define S_SIZE1 (1 << S_BITS) -#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) -#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) -#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) - -#if !defined(__x86_64__) && defined(__SSE4_1__) -/* 32-bit with SSE4.1 */ -#define PWXFORM_X_T __m128i -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = _mm_and_si128(X, _mm_set1_epi64x(S_MASK2)); \ - s0 = *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ - s1 = *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#else -/* 64-bit, or 32-bit without SSE4.1 */ -#define PWXFORM_X_T uint64_t -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = EXTRACT64(X) & S_MASK2; \ - s0 = *(const __m128i *)(S0 + (uint32_t)x); \ - s1 = *(const __m128i *)(S1 + (x >> 32)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#endif - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0, x0, s00, s01) \ - PWXFORM_SIMD(X1, x1, s10, s11) \ - PWXFORM_SIMD(X2, x2, s20, s21) \ - PWXFORM_SIMD(X3, x3, s30, s31) - -#define PWXFORM \ - { \ - PWXFORM_X_T x0, x1, x2, x3; \ - __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - } - -#define XOR4(in) \ - X0 = _mm_xor_si128(X0, (in)[0]); \ - X1 = _mm_xor_si128(X1, (in)[1]); \ - X2 = _mm_xor_si128(X2, (in)[2]); \ - X3 = _mm_xor_si128(X3, (in)[3]); - -#define XOUT(out) \ - (out)[0] = X0; \ - (out)[1] = X1; \ - (out)[2] = X2; \ - (out)[3] = X3; - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. 
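A mental model for PWXFORM_SIMD above, per 64-bit lane: take two S-box indices from the masked 32-bit halves, do a 32x32->64 multiply of those halves, then an S-box add and an S-box xor. A simplified one-lane scalar sketch, illustrative only, since the real macros process a pair of lanes per register and index 16-byte S-box entries once per pair:

    #include <stdint.h>
    #include <string.h>

    /* One pwxform lane, scalar model. S0 and S1 point at the two S-boxes;
       s_mask selects an aligned entry, like S_MASK above. */
    static uint64_t pwxform_lane(uint64_t x, const uint8_t *S0,
                                 const uint8_t *S1, uint32_t s_mask)
    {
        uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
        uint64_t s0, s1;

        memcpy(&s0, S0 + (lo & s_mask), 8);   /* lookups use the halves   */
        memcpy(&s1, S1 + (hi & s_mask), 8);   /* from before the multiply */
        x = (uint64_t)lo * hi;                /* 32x32 -> 64 multiply     */
        x += s0;                              /* S-box add                */
        x ^= s1;                              /* S-box xor                */
        return x;
    }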
- */ -static void -blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, - size_t r, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3; - size_t i; - - if (!S) { - blockmix_salsa8(Bin, Bout, r); - return; - } - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - - /* X <-- B_{r1 - 1} */ - X0 = Bin[r].q[0]; - X1 = Bin[r].q[1]; - X2 = Bin[r].q[2]; - X3 = Bin[r].q[3]; - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin[i].q) - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) -} - -#define XOR4_2(in1, in2) \ - X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ - X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ - X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ - X3 = _mm_xor_si128((in1)[3], (in2)[3]); - -static inline uint32_t -blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM) -{ - __m128i X0, X1, X2, X3; - size_t i; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } else { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q) - SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q) - SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) - - return _mm_cvtsi128_si32(X0); -} - -static uint32_t -blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3; - size_t i; - - if (!S) - return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r], _MM_HINT_NTA) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_NTA) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } else { - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef XOR4 -#define XOR4(in, out) \ - (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ - (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ - (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ - (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); - -static inline uint32_t -blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r) -{ - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - r--; - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q, Bin2[0].q) - SALSA20_8_XOR_REG(Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q, Bin2[i * 2].q) - SALSA20_8_XOR_REG(Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) - - return _mm_cvtsi128_si32(X0); -} - -#define XOR4_Y \ - X0 = _mm_xor_si128(X0, Y0); \ - X1 = _mm_xor_si128(X1, Y1); \ - X2 = _mm_xor_si128(X2, Y2); \ - X3 = _mm_xor_si128(X3, Y3); - -static uint32_t -blockmix_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - if (!S) - return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - XOR4(Bin1[i].q, Bin2[i].q) - /* X <-- H'(X \xor B_i) */ - XOR4_Y - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q, Bin2[i].q) - XOR4_Y - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef ARX -#undef SALSA20_2ROUNDS -#undef SALSA20_8 -#undef SALSA20_8_XOR_ANY -#undef SALSA20_8_XOR_MEM -#undef SALSA20_8_XOR_REG -#undef PWXFORM_SIMD_1 -#undef PWXFORM_SIMD_2 -#undef PWXFORM_ROUND -#undef PWXFORM -#undef OUT -#undef XOR4 -#undef XOR4_2 -#undef XOR4_Y - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t -integerify(const salsa20_blk_t * B, size_t r) -{ - return B[2 * r - 1].w[0]; -} - -/** - * smix1(B, r, N, flags, V, NROM, shared, XY, S): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r bytes in length. The value N must be even and no - * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and - * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 - * bytes as well saves cache lines, but might result in cache bank conflicts). - */ -static void -smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, - salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = V, * Y; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - if (NROM && (VROM_mask & 1)) { - uint32_t n; - salsa20_blk_t * V_n; - const salsa20_blk_t * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - X = &V[2 * s]; - if ((1 & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - } else { - /* X <-- H(X) */ - blockmix(Y, X, r, S); - j = integerify(X, r); - } - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V_n[i * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((n + i) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 1, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((N - 1) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - } - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 1, S); - } else if (flags & YESCRYPT_RW) { - uint32_t n; - salsa20_blk_t * V_n, * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[2 * s]; - blockmix(Y, X, r, S); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - Y = &V_n[i * s]; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 0, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 0, S); - } else { - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < N - 1; i += 2) { - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[(i + 1) * s]; - blockmix(Y, X, r, S); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - X = XY; - blockmix(Y, X, r, S); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. The value N must be a power of 2 - * greater than 1. The value Nloop must be even. 
The array V must be aligned - * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 - * bytes (aligning them to 64 bytes as well saves cache lines, but might result - * in cache bank conflicts). - */ -static void -smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, - yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, - const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = XY, * Y = &XY[s]; - uint64_t i; - uint32_t j; - size_t k; - - if (Nloop == 0) - return; - - /* X <-- B' */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - i = Nloop / 2; - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - -/* - * Normally, NROM implies YESCRYPT_RW, but we check for these separately - * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls - * operating on the entire V. - */ - if (NROM && (flags & YESCRYPT_RW)) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(X, V_j, Y, r, S); - - if (((i + 1) & VROM_mask) == 1) { - const salsa20_blk_t * VROM_j; - - j &= NROM - 1; - VROM_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, VROM_j, X, r, 1, S); - } else { - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(Y, V_j, X, r, S); - } - j &= N - 1; - V_j = &V[j * s]; - } - } else if (NROM) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((i + 1) & VROM_mask) == 1) { - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - j &= N - 1; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - j &= N - 1; - V_j = &V[j * s]; - } - } else if (flags & YESCRYPT_RW) { - /* 6: for i = 0 to N - 1 do */ - do { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(X, V_j, Y, r, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(Y, V_j, X, r, S); - j &= N - 1; - } while (--i); - } else { - /* 6: for i = 0 to N - 1 do */ - do { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 0, S); - j &= N - 1; - } while (--i); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. 
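Concretely, p2floor clears the lowest set bit until a single bit remains, so p2floor(7) == 4, p2floor(8) == 8 and p2floor(1536) == 1024 (1536 & 1535 == 1024, then 1024 & 1023 == 0, so the loop below stops at 1024). smix() further down uses it to clamp each thread's chunk length Np to a power of 2 before the second loop.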
- */
-static uint64_t
-p2floor(uint64_t x)
-{
-    uint64_t y;
-    while ((y = x & (x - 1)))
-        x = y;
-    return x;
-}
-
-/**
- * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
- * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
- * temporary storage V must be 128rN bytes in length; the temporary storage XY
- * must be 256r or 256rp bytes in length (the larger size is required with
- * OpenMP-enabled builds). The value N must be a power of 2 greater than 1.
- * The array V must be aligned to a multiple of 64 bytes, and arrays B and
- * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well
- * saves cache lines and helps avoid false sharing in OpenMP-enabled builds
- * when p > 1, but it might also result in cache bank conflicts).
- */
-static void
-smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t,
-    yescrypt_flags_t flags,
-    salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared,
-    salsa20_blk_t * XY, void * S)
-{
-    size_t s = 2 * r;
-    uint32_t Nchunk = N / p;
-    uint64_t Nloop_all, Nloop_rw;
-    uint32_t i;
-
-    Nloop_all = Nchunk;
-    if (flags & YESCRYPT_RW) {
-        if (t <= 1) {
-            if (t)
-                Nloop_all *= 2; /* 2/3 */
-            Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
-        } else {
-            Nloop_all *= t - 1;
-        }
-    } else if (t) {
-        if (t == 1)
-            Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
-        Nloop_all *= t;
-    }
-
-    Nloop_rw = 0;
-    if (flags & __YESCRYPT_INIT_SHARED)
-        Nloop_rw = Nloop_all;
-    else if (flags & YESCRYPT_RW)
-        Nloop_rw = Nloop_all / p;
-
-    Nchunk &= ~(uint32_t)1; /* round down to even */
-    Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
-    Nloop_rw &= ~(uint64_t)1; /* round down to even */
-
-#ifdef _OPENMP
-#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
-    {
-#pragma omp for
-#endif
-    for (i = 0; i < p; i++) {
-        uint32_t Vchunk = i * Nchunk;
-        uint8_t * Bp = &B[128 * r * i];
-        salsa20_blk_t * Vp = &V[Vchunk * s];
-#ifdef _OPENMP
-        salsa20_blk_t * XYp = &XY[i * (2 * s)];
-#else
-        salsa20_blk_t * XYp = XY;
-#endif
-        uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
-        void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S;
-        if (Sp)
-            smix1(Bp, 1, S_SIZE_ALL / 128,
-                flags & ~YESCRYPT_PWXFORM,
-                Sp, NROM, shared, XYp, NULL);
-        if (!(flags & __YESCRYPT_INIT_SHARED_2))
-            smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
-        smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
-            NROM, shared, XYp, Sp);
-    }
-
-    if (Nloop_all > Nloop_rw) {
-#ifdef _OPENMP
-#pragma omp for
-#endif
-        for (i = 0; i < p; i++) {
-            uint8_t * Bp = &B[128 * r * i];
-#ifdef _OPENMP
-            salsa20_blk_t * XYp = &XY[i * (2 * s)];
-#else
-            salsa20_blk_t * XYp = XY;
-#endif
-            void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S;
-            smix2(Bp, r, N, Nloop_all - Nloop_rw,
-                flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
-        }
-    }
-#ifdef _OPENMP
-    }
-#endif
-}
-
-/**
- * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
- * N, r, p, t, flags, buf, buflen, thrid):
- * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
- * p, buflen), or a revision of scrypt as requested by flags and shared, and
- * write the result into buf. The parameters r, p, and buflen must satisfy
- * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
- * of 2 greater than 1. (This optimized implementation currently additionally
- * limits N to the range from 8 to 2^31, but other implementations might not.)
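For scale, a hypothetical call using this API; the wrapper and its parameter values are illustrative, not taken from this tree. N = 2048, r = 8, p = 1, t = 0 makes V cost 128 * r * N = 2 MiB per thread:

    #include <stdint.h>
    #include "yescrypt.h"

    /* Usage sketch only; kdf_example and its values are hypothetical. */
    static int kdf_example(const uint8_t *passwd, size_t passwdlen,
                           const uint8_t *salt, size_t saltlen,
                           uint8_t dk[32], int thrid)
    {
        yescrypt_shared_t shared;
        yescrypt_local_t local;
        int rc;

        /* Empty shared region: no ROM, so the NROM == 0 paths are taken. */
        if (yescrypt_init_shared(&shared, NULL, 0, 0, 0, 0, 0, 0, NULL, 0))
            return -1;
        if (yescrypt_init_local(&local))
            return -1;
        rc = yescrypt_kdf(&shared, &local, passwd, passwdlen, salt, saltlen,
                          2048, 8, 1, 0, YESCRYPT_RW | YESCRYPT_PWXFORM,
                          dk, 32, thrid);
        yescrypt_free_local(&local);
        return rc;   /* 1 = success, 0 = work restart, -1 = error */
    }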
- * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen, int thrid ) -{ - uint8_t _ALIGN(128) sha256[32]; - yescrypt_region_t tmp; - uint64_t NROM; - size_t B_size, V_size, XY_size, need; - uint8_t * B, * S; - salsa20_blk_t * V, * XY; - int retval = 1; - - /* - * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, - * so don't let it have side-effects. Without this adjustment, it'd - * enable the SHA-256 password pre-hashing and output post-hashing, - * because any deviation from classic scrypt implies those. - */ - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - /* Sanity-check parameters */ - if (flags & ~YESCRYPT_KNOWN_FLAGS) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (N > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { - errno = EINVAL; - return -1; - } - if ((r > SIZE_MAX / 256 / p) || - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX) && - (N > SIZE_MAX / 128 / (r * p))) { - errno = ENOMEM; - return -1; - } -#endif - if ((flags & YESCRYPT_PWXFORM) && -#ifndef _OPENMP - (flags & YESCRYPT_PARALLEL_SMIX) && -#endif - p > SIZE_MAX / S_SIZE_ALL) { - errno = ENOMEM; - return -1; - } - - NROM = 0; - if (shared->shared1.aligned) { - NROM = shared->shared1.aligned_size / ((size_t)128 * r); - if (NROM > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (salsa20_blk_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_PWXFORM) { - size_t S_size = S_SIZE_ALL; -#ifdef _OPENMP - S_size *= p; -#else - if (flags & YESCRYPT_PARALLEL_SMIX) - S_size *= p; -#endif - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint8_t *)tmp.aligned; - XY = (salsa20_blk_t *)((uint8_t *)B + B_size); - } else { - 
init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_PWXFORM) - S = (uint8_t *)XY + XY_size; - - if (t || flags) { - SHA256_Buf( passwd, passwdlen, sha256 ); - passwd = sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); - - if ( work_restart[thrid].restart ) - { - retval = 0; - goto out; - } - - if (t || flags) - memcpy(sha256, B, sizeof(sha256)); - - if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { - smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, - &V[(size_t)2 * r * i * N], - NROM, shared, - &XY[(size_t)4 * r * i], - S ? &S[S_SIZE_ALL * i] : S); -#else - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, - NROM, shared, XY, S); -#endif - } - } - - if ( work_restart[thrid].restart ) - { - retval = 0; - goto out; - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); - - if ( work_restart[thrid].restart ) - { - retval = 0; - goto out; - } - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if ((t || flags) && buflen == sizeof(sha256)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, buf, buflen); - if ( yescrypt_client_key ) - HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key, - yescrypt_client_key_len ); - else - HMAC_SHA256_Update( &ctx, salt, saltlen ); - HMAC_SHA256_Final(sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_Buf( sha256, sizeof(sha256), buf ); - } - } - -out: - if (free_region(&tmp)) - return -1; - - /* Success! */ - return retval; -} diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c deleted file mode 100644 index 7d3efa1a..00000000 --- a/algo/yescrypt/yescrypt.c +++ /dev/null @@ -1,488 +0,0 @@ -/*- - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include - -#include "compat.h" - -#include "yescrypt.h" -#include "algo/sha/hmac-sha256-hash.h" -#include "algo-gate-api.h" - -#define BYTES2CHARS(bytes) \ - ((((bytes) * 8) + 5) / 6) - -#define HASH_SIZE 32 /* bytes */ -#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ -#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM) - -static const char * const itoa64 = - "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits) -{ - uint32_t bit; - - for (bit = 0; bit < srcbits; bit += 6) { - if (dstlen < 1) - return NULL; - *dst++ = itoa64[src & 0x3f]; - dstlen--; - src >>= 6; - } - - return dst; -} - -static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen) -{ - size_t i; - - for (i = 0; i < srclen; ) { - uint8_t * dnext; - uint32_t value = 0, bits = 0; - do { - value |= (uint32_t)src[i++] << bits; - bits += 8; - } while (bits < 24 && i < srclen); - dnext = encode64_uint32(dst, dstlen, value, bits); - if (!dnext) - return NULL; - dstlen -= dnext - dst; - dst = dnext; - } - - return dst; -} - -static int decode64_one(uint32_t* dst, uint8_t src) -{ - const char * ptr = strchr(itoa64, src); - if (ptr) { - *dst = (uint32_t) (ptr - itoa64); - return 0; - } - *dst = 0; - return -1; -} - -static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src) -{ - uint32_t bit; - uint32_t value; - - value = 0; - for (bit = 0; bit < dstbits; bit += 6) { - uint32_t one; - if (decode64_one(&one, *src)) { - *dst = 0; - return NULL; - } - src++; - value |= one << bit; - } - - *dst = value; - return src; -} - -uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local, - const uint8_t* passwd, size_t passwdlen, const uint8_t* setting, - uint8_t* buf, size_t buflen, int thrid ) -{ - uint8_t hash[HASH_SIZE]; - const uint8_t * src, * salt; - uint8_t * dst; - size_t prefixlen, saltlen, need; - uint8_t version; - uint64_t N; - uint32_t r, p; - yescrypt_flags_t flags = YESCRYPT_WORM; - - printf("pass1 ..."); - fflush(stdout); - - if (setting[0] != '$' || setting[1] != '7') { - printf("died$7 ..."); - fflush(stdout); - return NULL; - } - - printf("died80 ..."); - fflush(stdout); - - src = setting + 2; - - printf("hello '%p'\n", (char *)src); - fflush(stdout); - - switch ((version = *src)) { - case '$': - printf("died2 ..."); - fflush(stdout); - break; - case 'X': - src++; - flags = YESCRYPT_RW; - printf("died3 ..."); - fflush(stdout); - break; - default: - printf("died4 ..."); - fflush(stdout); - return NULL; - } - - printf("pass2 ..."); - fflush(stdout); - - if (*src != '$') { - uint32_t decoded_flags; - if (decode64_one(&decoded_flags, *src)) { - printf("died5 ..."); - fflush(stdout); - return NULL; - } - flags = decoded_flags; - if (*++src != '$') { - printf("died6 ..."); - fflush(stdout); - return NULL; - } - } - - src++; - - { - uint32_t N_log2; - if (decode64_one(&N_log2, 
*src)) { - printf("died7 ..."); - return NULL; - } - src++; - N = (uint64_t)1 << N_log2; - } - - src = decode64_uint32(&r, 30, src); - if (!src) { - printf("died6 ..."); - return NULL; - } - - src = decode64_uint32(&p, 30, src); - if (!src) { - printf("died7 ..."); - return NULL; - } - - prefixlen = src - setting; - - salt = src; - src = (uint8_t *)strrchr((char *)salt, '$'); - if (src) - saltlen = src - salt; - else - saltlen = strlen((char *)salt); - - need = prefixlen + saltlen + 1 + HASH_LEN + 1; - if (need > buflen || need < saltlen) { - printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen); - printf("died8killbuf ..."); - fflush(stdout); - return NULL; - } - - if ( yescrypt_kdf( shared, local, passwd, passwdlen, salt, saltlen, N, r, p, - 0, flags, hash, sizeof(hash), thrid ) == -1 ) - { - printf("died10 ..."); - fflush(stdout); - return NULL; - } - - dst = buf; - memcpy(dst, setting, prefixlen + saltlen); - dst += prefixlen + saltlen; - *dst++ = '$'; - - dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash)); - /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its - * memory allocations yet anyway. */ - if (!dst || dst >= buf + buflen) { /* Can't happen */ - printf("died11 ..."); - return NULL; - } - - *dst = 0; /* NUL termination */ - - printf("died12 ..."); - fflush(stdout); - - return buf; -} - -uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting, int thrid ) -{ - static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1]; - yescrypt_shared_t shared; - yescrypt_local_t local; - uint8_t * retval; - - if (yescrypt_init_shared(&shared, NULL, 0, - 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) - return NULL; - if (yescrypt_init_local(&local)) { - yescrypt_free_shared(&shared); - return NULL; - } - retval = yescrypt_r(&shared, &local, - passwd, 80, setting, buf, sizeof(buf), thrid ); - //printf("hashse='%s'\n", (char *)retval); - if (yescrypt_free_local(&local)) { - yescrypt_free_shared(&shared); - return NULL; - } - if (yescrypt_free_shared(&shared)) - return NULL; - return retval; -} - -uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags, - const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen) -{ - uint8_t * dst; - size_t prefixlen = 3 + 1 + 5 + 5; - size_t saltlen = BYTES2CHARS(srclen); - size_t need; - - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - if (flags) { - if (flags & ~0x3f) - return NULL; - - prefixlen++; - if (flags != YESCRYPT_RW) - prefixlen++; - } - - need = prefixlen + saltlen + 1; - if (need > buflen || need < saltlen || saltlen < srclen) - return NULL; - - if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30))) - return NULL; - - dst = buf; - *dst++ = '$'; - *dst++ = '7'; - if (flags) { - *dst++ = 'X'; /* eXperimental, subject to change */ - if (flags != YESCRYPT_RW) - *dst++ = itoa64[flags]; - } - *dst++ = '$'; - - *dst++ = itoa64[N_log2]; - - dst = encode64_uint32(dst, buflen - (dst - buf), r, 30); - if (!dst) /* Can't happen */ - return NULL; - - dst = encode64_uint32(dst, buflen - (dst - buf), p, 30); - if (!dst) /* Can't happen */ - return NULL; - - dst = encode64(dst, buflen - (dst - buf), src, srclen); - if (!dst || dst >= buf + buflen) /* Can't happen */ - return NULL; - - *dst = 0; /* NUL termination */ - - return buf; -} - -uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags, - const uint8_t * src, size_t srclen) -{ - static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1]; - return 
yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, - buf, sizeof(buf)); -} - -static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, - uint8_t * buf, size_t buflen, int thrid ) -{ - static __thread int initialized = 0; - static __thread yescrypt_shared_t shared; - static __thread yescrypt_local_t local; - int retval; - if (!initialized) { -/* "shared" could in fact be shared, but it's simpler to keep it private - * along with "local". It's dummy and tiny anyway. */ - if (yescrypt_init_shared(&shared, NULL, 0, - 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) - return -1; - if (yescrypt_init_local(&local)) { - yescrypt_free_shared(&shared); - return -1; - } - initialized = 1; - } - retval = yescrypt_kdf(&shared, &local, - passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, - buf, buflen, thrid ); -#if 0 - if (yescrypt_free_local(&local)) { - yescrypt_free_shared(&shared); - return -1; - } - if (yescrypt_free_shared(&shared)) - return -1; - initialized = 0; -#endif - return retval; -} - -// scrypt parameters initialized at run time. -uint64_t YESCRYPT_N; -uint32_t YESCRYPT_R; -uint32_t YESCRYPT_P; -char *yescrypt_client_key = NULL; -int yescrypt_client_key_len = 0; - -/* main hash 80 bytes input */ -int yescrypt_hash( const char *input, char *output, uint32_t len, int thrid ) -{ - return yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N, - YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32, thrid ); -} - -/* for util.c test */ -int yescrypthash(void *output, const void *input, int thrid) -{ - return yescrypt_hash((char*) input, (char*) output, 80, thrid); -} - -int scanhash_yescrypt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) vhash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce; - uint32_t n = first_nonce; - int thr_id = mythr->id; - - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - endiandata[19] = n; - do { - if ( yescrypt_hash((char*) endiandata, (char*) vhash, 80, thr_id ) ) - if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) - { - be32enc( pdata+19, n ); - submit_solution( work, vhash, mythr ); - } - endiandata[19] = ++n; - } while ( n < last_nonce && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce; - pdata[19] = n; - return 0; -} - -void yescrypt_gate_base(algo_gate_t *gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - opt_target_factor = 65536.0; -} - -bool register_yescrypt_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - - if ( opt_param_n ) YESCRYPT_N = opt_param_n; - else YESCRYPT_N = 2048; - - if ( opt_param_r ) YESCRYPT_R = opt_param_r; - else YESCRYPT_R = 8; - - if ( opt_param_key ) - { - yescrypt_client_key = opt_param_key; - yescrypt_client_key_len = strlen( opt_param_key ); - } - else - { - yescrypt_client_key = NULL; - yescrypt_client_key_len = 0; - } - - YESCRYPT_P = 1; - - applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d", YESCRYPT_N, - YESCRYPT_R ); - if ( yescrypt_client_key ) - applog( LOG_NOTICE,"Key= \"%s\"\n", yescrypt_client_key ); - - return true; -} - -bool register_yescryptr8_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - yescrypt_client_key = "Client Key"; - 
yescrypt_client_key_len = 10; - YESCRYPT_N = 2048; - YESCRYPT_R = 8; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr16_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - yescrypt_client_key = "Client Key"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 4096; - YESCRYPT_R = 16; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr32_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - yescrypt_client_key = "WaviBanana"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 4096; - YESCRYPT_R = 32; - YESCRYPT_P = 1; - return true; -} - diff --git a/algo/yescrypt/yescrypt.h b/algo/yescrypt/yescrypt.h deleted file mode 100644 index 51be262e..00000000 --- a/algo/yescrypt/yescrypt.h +++ /dev/null @@ -1,382 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#ifndef YESCRYPT_H -#define YESCRYPT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include /* for size_t */ -#include -#include "miner.h" - -//#define __SSE4_1__ - -int yescrypt_hash(const char* input, char* output, uint32_t len, int thrid ); - -int yescrypthash(void *output, const void *input, int thrid ); - -/** - * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen) and write the result into buf. The parameters r, p, and buflen - * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N - * must be a power of 2 greater than 1. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, - uint8_t * __buf, size_t __buflen); - -/** - * Internal type used by the memory allocator. Please do not use it directly. - * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since - * they might differ from each other in a future version. 
- */ -typedef struct { - void * base, * aligned; - size_t base_size, aligned_size; -} yescrypt_region_t; - -/** - * Types for shared (ROM) and thread-local (RAM) data structures. - */ -typedef yescrypt_region_t yescrypt_shared1_t; -typedef struct { - yescrypt_shared1_t shared1; - uint32_t mask1; -} yescrypt_shared_t; -typedef yescrypt_region_t yescrypt_local_t; - -/** - * Possible values for yescrypt_init_shared()'s flags argument. - */ -typedef enum { - YESCRYPT_SHARED_DEFAULTS = 0, - YESCRYPT_SHARED_PREALLOCATED = 0x100 -} yescrypt_init_shared_flags_t; - -/** - * Possible values for the flags argument of yescrypt_kdf(), - * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, - * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. - * Please refer to the description of yescrypt_kdf() below for the meaning of - * these flags. - */ -typedef enum { -/* public */ - YESCRYPT_WORM = 0, - YESCRYPT_RW = 1, - YESCRYPT_PARALLEL_SMIX = 2, - YESCRYPT_PWXFORM = 4, -/* private */ - __YESCRYPT_INIT_SHARED_1 = 0x10000, - __YESCRYPT_INIT_SHARED_2 = 0x20000, - __YESCRYPT_INIT_SHARED = 0x30000 -} yescrypt_flags_t; - -extern char *yescrypt_client_key; -extern int yescrypt_client_key_len; - - -#define YESCRYPT_KNOWN_FLAGS \ - (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ - __YESCRYPT_INIT_SHARED) - -/** - * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, - * buf, buflen): - * Optionally allocate memory for and initialize the shared (ROM) data - * structure. The parameters N, r, and p must satisfy the same conditions as - * with crypto_scrypt(). param and paramlen specify a local parameter with - * which the ROM is seeded. If buf is not NULL, then it is used to return - * buflen bytes of message digest for the initialized ROM (the caller may use - * this to verify that the ROM has been computed in the same way that it was on - * a previous run). - * - * Return 0 on success; or -1 on error. - * - * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the - * ROM is assumed to have been preallocated by the caller, with - * shared->shared1.aligned being the start address of the ROM and - * shared->shared1.aligned_size being its size (which must be consistent with - * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV - * shared memory segment allocated by the caller. - * - * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it - * should be set to 1, to interleave RAM and ROM accesses, which works well - * when both regions reside in the machine's RAM anyway. Other values may be - * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask - * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask - * values: - * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop - * 0 0 1/2 - * 1 1/2 1/2 - * 2 0 1/4 - * 3 1/4 1/4 - * 6 0 1/8 - * 7 1/8 1/8 - * 14 0 1/16 - * 15 1/16 1/16 - * 1022 0 1/1024 - * 1023 1/1024 1/1024 - * - * Actual computation of the ROM contents may be avoided, if you don't intend - * to use a ROM but need a dummy shared structure, by calling this function - * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the - * arguments starting with param and on. - * - * MT-safe as long as shared is local to the thread. 
- */ -extern int yescrypt_init_shared(yescrypt_shared_t * __shared, - const uint8_t * __param, size_t __paramlen, - uint64_t __N, uint32_t __r, uint32_t __p, - yescrypt_init_shared_flags_t __flags, uint32_t __mask, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_free_shared(shared): - * Free memory that had been allocated with yescrypt_init_shared(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as shared is local to the thread. - */ -extern int yescrypt_free_shared(yescrypt_shared_t * __shared); - -/** - * yescrypt_init_local(local): - * Initialize the thread-local (RAM) data structure. Actual memory allocation - * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_init_local(yescrypt_local_t * __local); - -/** - * yescrypt_free_local(local): - * Free memory that may have been allocated for an initialized thread-local - * (RAM) data structure. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_free_local(yescrypt_local_t * __local); - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters N, r, p, and buflen must satisfy - * the same conditions as with crypto_scrypt(). t controls computation time - * while not affecting peak memory usage. shared and flags may request - * special modes as described below. local is the thread-local data - * structure, allowing to preserve and reuse a memory allocation across calls, - * thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - * - * t controls computation time. t = 0 is optimal in terms of achieving the - * highest area-time for ASIC attackers. Thus, higher computation time, if - * affordable, is best achieved by increasing N rather than by increasing t. - * However, if the higher memory usage (which goes along with higher N) is not - * affordable, or if fine-tuning of the time is needed (recall that N must be a - * power of 2), then t = 1 or above may be used to increase time while staying - * at the same peak memory usage. t = 1 increases the time by 25% and - * decreases the normalized area-time to 96% of optimal. (Of course, in - * absolute terms the area-time increases with higher t. It's just that it - * would increase slightly more with higher N*r rather than with higher t.) - * t = 2 increases the time by another 20% and decreases the normalized - * area-time to 89% of optimal. Thus, these two values are reasonable to use - * for fine-tuning. Values of t higher than 2 result in further increase in - * time while reducing the efficiency much further (e.g., down to around 50% of - * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact - * numbers varying by the flags settings). - * - * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and - * passing a dummy shared structure (see the description of - * yescrypt_init_shared() above for how to produce one). In this mode, the - * thread-local memory region (RAM) is first sequentially written to and then - * randomly read from. 
This algorithm is friendly towards time-memory - * tradeoffs (TMTO), available both to defenders (albeit not in this - * implementation) and to attackers. - * - * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local - * memory region (RAM), which makes TMTO a lot less efficient. This may be - * used to slow down the kinds of attackers who would otherwise benefit from - * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not - * only for the tradeoff, but also for a decrease of attacker's area-time (by - * up to a constant factor), setting YESCRYPT_RW substantially increases the - * cost of attacks in area-time terms as well. Yet another benefit of it is - * that optimal area-time is reached at an earlier time than with classic - * scrypt, and t = 0 actually corresponds to this earlier completion time, - * resulting in quicker hash computations (and thus in higher request rate - * capacity). Due to these properties, YESCRYPT_RW should almost always be - * set, except when compatibility with classic scrypt or TMTO-friendliness are - * desired. - * - * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a - * lower level as compared to where it is in classic scrypt. This reduces - * flexibility for efficient computation (for both attackers and defenders) by - * requiring that, short of resorting to TMTO, the full amount of memory be - * allocated as needed for the specified p, regardless of whether that - * parallelism is actually being fully made use of or not. (For comparison, a - * single instance of classic scrypt may be computed in less memory without any - * CPU time overhead, but in more real time, by not making full use of the - * parallelism.) This may be desirable when the defender has enough memory - * with sufficiently low latency and high bandwidth for efficient full parallel - * execution, yet the required memory size is high enough that some likely - * attackers might end up being forced to choose between using higher latency - * memory than they could use otherwise (waiting for data longer) or using TMTO - * (waiting for data more times per one hash computation). The area-time cost - * for other kinds of attackers (who would use the same memory type and TMTO - * factor or no TMTO either way) remains roughly the same, given the same - * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as - * long as the defender has enough memory that is just as fast as the smaller - * per-thread regions would be, doesn't expect to ever need greater - * flexibility (except possibly via TMTO), and doesn't need backwards - * compatibility with classic scrypt, there are no other serious drawbacks to - * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, - * this new approach to parallelization makes TMTO less inefficient. (This is - * an unfortunate side-effect of avoiding some random writes, as we have to in - * order to allow for parallel threads to access a common memory region without - * synchronization overhead.) Thus, in this mode this setting poses an extra - * tradeoff of its own (higher area-time cost for a subset of attackers vs. - * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the - * way the running time is to be controlled from N*r*p (for classic scrypt) to - * N*r (in this modification). All of this applies only when p > 1. For - * p = 1, this setting is a no-op. 
- * - * Passing a real shared structure, with ROM contents previously computed by - * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for - * the thread-local RAM region. In order to allow for initialization of the - * ROM to be split into a separate program, the shared->shared1.aligned and - * shared->shared1.aligned_size fields may be set by the caller of - * yescrypt_kdf() manually rather than with yescrypt_init_shared(). - * - * local must be initialized with yescrypt_init_local(). - * - * MT-safe as long as local and buf are local to the thread. - */ -extern int yescrypt_kdf(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, - yescrypt_flags_t __flags, - uint8_t * __buf, size_t __buflen, int thrid); - -/** - * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. If the shared structure is - * not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to - * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff - * discouraging modification) is determined by the setting string. shared and - * local must be initialized as described above for yescrypt_kdf(). buf must - * be large enough (as indicated by buflen) to hold the encoded hash string. - * - * Return the encoded hash string on success; or NULL on error. - * - * MT-safe as long as local and buf are local to the thread. - */ -extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __setting, - uint8_t * __buf, size_t __buflen, int thrid); - -/** - * yescrypt(passwd, setting): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. Whether to use the - * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff - * discouraging modification) is determined by the setting string. - * - * Return the encoded hash string on success; or NULL on error. - * - * This is a crypt(3)-like interface, which is simpler to use than - * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, - * and it is slower than yescrypt_r() for repeated calls because it allocates - * and frees memory on each call. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting, int thrid ); - -/** - * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): - * Generate a setting string for use with yescrypt_r() and yescrypt() by - * encoding into it the parameters N_log2 (which is to be set to base 2 - * logarithm of the desired value for N), r, p, flags, and a salt given by src - * (of srclen bytes). buf must be large enough (as indicated by buflen) to - * hold the setting string. - * - * Return the setting string on success; or NULL on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern uint8_t * yescrypt_gensalt_r( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): - * Generate a setting string for use with yescrypt_r() and yescrypt(). 
This - * function is the same as yescrypt_gensalt_r() except that it uses a static - * buffer and thus is not MT-safe. - * - * Return the setting string on success; or NULL on error. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt_gensalt( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index a52dea2c..54d119e2 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -161,7 +161,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate ) // Legacy Yescrypt (yespower v0.5) -bool register_yescrypt_05_algo( algo_gate_t* gate ) +bool register_yescrypt_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; @@ -194,7 +194,7 @@ bool register_yescrypt_05_algo( algo_gate_t* gate ) } -bool register_yescryptr8_05_algo( algo_gate_t* gate ) +bool register_yescryptr8_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; @@ -207,7 +207,7 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate ) return true; } -bool register_yescryptr16_05_algo( algo_gate_t* gate ) +bool register_yescryptr16_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; @@ -220,7 +220,7 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate ) return true; } -bool register_yescryptr32_05_algo( algo_gate_t* gate ) +bool register_yescryptr32_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; diff --git a/configure b/configure index 93e54874..28c974d2 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.20.1' -PACKAGE_STRING='cpuminer-opt 3.20.1' +PACKAGE_VERSION='3.20.2' +PACKAGE_STRING='cpuminer-opt 3.20.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.20.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.20.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.20.1 +cpuminer-opt configure 3.20.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. 
-It was created by cpuminer-opt $as_me 3.20.1, which was +It was created by cpuminer-opt $as_me 3.20.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.20.1' + VERSION='3.20.2' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.20.1, which was +This file was extended by cpuminer-opt $as_me 3.20.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.20.1 +cpuminer-opt config.status 3.20.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index d0835055..637fb4bc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.20.1]) +AC_INIT([cpuminer-opt], [3.20.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 2a52eb2f..bca99ed2 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #endif // Mask making - // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. // Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements. +// Effectively a sign test. #define mm_movmask_64( v ) \ _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) ) @@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // // Bit rotations -// AVX512VL has implemented bit rotation for 128 bit vectors with -// 64 and 32 bit elements. - // x2 rotates elements in 2 individual vectors in a double buffered // optimization for SSE2, does nothing for AVX512 but is there for // transparency. -// compiler doesn't like when a variable is used for the last arg of -// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same -// specification but works with a variable. Therefore use rol_var where -// necessary. -// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required. 
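For reference, every rotate macro in these headers, the removed _var forms as well as the SSE2 fallbacks added below, relies on the same shift-or identity: ror(x,c) == (x >> c) | (x << (width-c)) for 0 < c < width. A minimal scalar sketch of the identity (illustrative only, not code from this patch):

   static inline uint32_t ror32( const uint32_t x, const int c )
   {  /* e.g. ror32( 0x11223344, 8 ) == 0x44112233 */
      return ( x >> c ) | ( x << ( 32 - c ) );
   }

The vector macros apply exactly this per 64, 32 or 16 bit element: two shifts and an OR when no hardware rotate instruction is available.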
- -#define mm128_ror_var_64( v, c ) \ - _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) - -#define mm128_rol_var_64( v, c ) \ - _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ) - -#define mm128_ror_var_32( v, c ) \ - _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ) - -#define mm128_rol_var_32( v, c ) \ - _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) - - #if defined(__AVX512VL__) -//#if defined(__AVX512F__) && defined(__AVX512VL__) #define mm128_ror_64 _mm_ror_epi64 #define mm128_rol_64 _mm_rol_epi64 @@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #else // SSE2 -#define mm128_ror_64 mm128_ror_var_64 -#define mm128_rol_64 mm128_rol_var_64 -#define mm128_ror_32 mm128_ror_var_32 -#define mm128_rol_32 mm128_rol_var_32 +#define mm128_ror_64( v, c ) \ + _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) + +#define mm128_rol_64( v, c ) \ + _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ) + +#define mm128_ror_32( v, c ) \ + _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ) + +#define mm128_rol_32( v, c ) \ + _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) #define mm128_rorx2_64( v1, v0, c ) \ { \ @@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_16( v, c ) \ _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ) +// Deprecated. +#define mm128_rol_var_32( v, c ) \ + _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) + +// // Limited 2 input shuffle, combines shuffle with blend. The destination low // half is always taken from src a, and the high half from src b. #define mm128_shuffle2_64( a, b, c ) \ @@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \ _mm_castsi128_ps( b ), c ) ); - // // Rotate vector elements accross all lanes @@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 ) +#if defined(__SSSE3__) + +// Rotate right by c bytes, no SSE2 equivalent. +static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) +{ return _mm_alignr_epi8( v, v, c ); } + +#endif + +// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations +// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available +// (unlikely but faster), or when SSSE3 is not available (slower). -// Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) #define mm128_shuflr64_32 mm128_swap64_32 #define mm128_shufll64_32 mm128_swap64_32 -#if defined(__SSSE3__) +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_shuflr64_24( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) +#else + #define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 ) +#endif -// Rotate right by c bytes, no SSE2 equivalent. 
-static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) -{ return _mm_alignr_epi8( v, v, c ); } +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_shuflr64_16( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) +#else + #define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 ) +#endif + +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_swap32_16( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) +#else + #define mm128_swap32_16( v ) mm128_ror_32( v, 16 ) +#endif +#define mm128_shuflr32_16 mm128_swap32_16 +#define mm128_shufll32_16 mm128_swap32_16 + +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_shuflr32_8( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) +#else + #define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 ) +#endif // // Endian byte swap. +#if defined(__SSSE3__) + #define mm128_bswap_64( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \ 0x0001020304050607 ) ) @@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // // Rotate in place concatenated 128 bit vectors as one 256 bit vector. -// Swap 128 bit vectorse. - +// Swap 128 bit vectors. +// This should be avoided; it's more efficient to switch references. #define mm128_swap256_128( v1, v2 ) \ v1 = _mm_xor_si128( v1, v2 ); \ v2 = _mm_xor_si128( v1, v2 ); \ @@ -552,8 +580,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // Function macros with two inputs and one output, inputs are preserved. // Returns the high 128 bits, ie updated v1. -// These two-input functions are not available without SSSE3. Use procedure -// macros below instead. +// These functions are preferred but only available with SSSE3. Use procedure +// macros below for SSE2 compatibility. #define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) #define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) #define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) // Procedure macros with 2 inputs and 2 outputs, input args are overwritten. -// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility -// with existing code. The function macros above can be used more effciently. +// Deprecated for SSSE3 and above; SSSE3 versions exist only for +// compatibility with existing code. #define mm128_vror256_64( v1, v2 ) \ do { \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 025436f6..2e9423e5 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -13,6 +13,18 @@ // AVX512 implementations. They will be selected automatically but their use // is limited because 256 bit vectors are less likely to be used when 512 // is available. +// +// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512 +// version is not. Some usage has the index vector encoded as if full vector +// shuffles are supported. This has no side effects and would have the same +// results using either version. +// If needed and AVX512 is available, 256 bit full vector shuffles can be +// implemented using the AVX512 zero-mask feature with a NULL mask.
+// Using intrinsics it's simple: +// _mm256_maskz_shuffle_epi8( k0, v, c ) +// With asm it's a bit more complicated with the addition of the mask register +// and zero tag: +// vpshufb ymm0{k0}{z}, ymm1, ymm2 #if defined(__AVX__) @@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif // Mask making - // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask. // Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements. +// Effectively a sign test. #define mm256_movmask_64( v ) \ _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) ) @@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Bit rotations. // -// The only bit shift for more than 64 bits is with __int128 which is slow. -// -// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements -// // x2 rotates elements in 2 individual vectors in a double buffered -// optimization for SSE2, does nothing for AVX512 but is there for +// optimization for AVX2, does nothing for AVX512 but is here for // transparency. - -// compiler doesn't like when a variable is used for the last arg of -// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where -// necessary. - -#define mm256_ror_var_64( v, c ) \ - _mm256_or_si256( _mm256_srli_epi64( v, c ), \ - _mm256_slli_epi64( v, 64-(c) ) ) - -#define mm256_rol_var_64( v, c ) \ - _mm256_or_si256( _mm256_slli_epi64( v, c ), \ - _mm256_srli_epi64( v, 64-(c) ) ) - -#define mm256_ror_var_32( v, c ) \ - _mm256_or_si256( _mm256_srli_epi32( v, c ), \ - _mm256_slli_epi32( v, 32-(c) ) ) - -#define mm256_rol_var_32( v, c ) \ - _mm256_or_si256( _mm256_slli_epi32( v, c ), \ - _mm256_srli_epi32( v, 32-(c) ) ) - - -// The spec says both F & VL are required, but just in case AMD -// decides to implement ROL/R without AVX512F. #if defined(__AVX512VL__) -//#if defined(__AVX512F__) && defined(__AVX512VL__) - -// AVX512, control must be 8 bit immediate. #define mm256_ror_64 _mm256_ror_epi64 #define mm256_rol_64 _mm256_rol_epi64 @@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #else // AVX2 -#define mm256_ror_64 mm256_ror_var_64 -#define mm256_rol_64 mm256_rol_var_64 -#define mm256_ror_32 mm256_ror_var_32 -#define mm256_rol_32 mm256_rol_var_32 +// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8. + +#define mm256_ror_64( v, c ) \ + _mm256_or_si256( _mm256_srli_epi64( v, c ), \ + _mm256_slli_epi64( v, 64-(c) ) ) + +#define mm256_rol_64( v, c ) \ + _mm256_or_si256( _mm256_slli_epi64( v, c ), \ + _mm256_srli_epi64( v, 64-(c) ) ) + +#define mm256_ror_32( v, c ) \ + _mm256_or_si256( _mm256_srli_epi32( v, c ), \ + _mm256_slli_epi32( v, 32-(c) ) ) + +#define mm256_rol_32( v, c ) \ + _mm256_or_si256( _mm256_slli_epi32( v, c ), \ + _mm256_srli_epi32( v, 32-(c) ) ) #define mm256_rorx2_64( v1, v0, c ) \ { \ @@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_or_si256( _mm256_slli_epi16( v, c ), \ _mm256_srli_epi16( v, 16-(c) ) ) +// Deprecated. +#define mm256_rol_var_32( v, c ) \ + _mm256_or_si256( _mm256_slli_epi32( v, c ), \ + _mm256_srli_epi32( v, 32-(c) ) ) // // Rotate elements accross all lanes. 
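The AVX2 fallbacks above pair with the byte-rotation shuffles defined a little further down: rotations by 24, 16 or 8 bits reduce to a single _mm256_shuffle_epi8 (or a real ror instruction with AVX512VL), while other counts still need the two-shift-plus-OR sequence. As a hedged sketch of how a caller exploits that split, here is a Blake2b-style G mixing step for 4-way AVX2; the mm256_* macros are from this patch, but the G_4WAY name and its arguments are illustrative, not code from the miner:

   #define G_4WAY( a, b, c, d, m0, m1 ) \
   { \
      a = _mm256_add_epi64( _mm256_add_epi64( a, b ), m0 ); \
      d = mm256_swap64_32( _mm256_xor_si256( d, a ) );   /* ror 32, shuffle */ \
      c = _mm256_add_epi64( c, d ); \
      b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); /* ror 24, shuffle */ \
      a = _mm256_add_epi64( _mm256_add_epi64( a, b ), m1 ); \
      d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); /* ror 16, shuffle */ \
      c = _mm256_add_epi64( c, d ); \
      b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );  /* ror 63, shift-or */ \
   }

Three of Blake2b's four rotation counts (32, 24, 16) are multiples of 8, which is what makes the shuffle-based forms pay off; only the rotate by 63 takes the generic path.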
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // Rotate 256 bit vector by one 64 bit element #define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) - #define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. @@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) ) - // // Rotate elements within each 128 bit lane of 256 bit vector. @@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \ _mm256_castsi256_ps( b ), c ) ); - #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) #define mm256_shuflr128_64 mm256_swap128_64 #define mm256_shufll128_64 mm256_swap128_64 @@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) { return _mm256_alignr_epi8( v, v, c ); } -// Swap 32 bit elements in each 64 bit lane. +// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit +// rotations for multiples of 8 bits. Uses faster ror/rol instructions when +// AVX512 is available. + #define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) #define mm256_shuflr64_32 mm256_swap64_32 #define mm256_shufll64_32 mm256_swap64_32 +#if defined(__AVX512VL__) + #define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 ) +#else + #define mm256_shuflr64_24( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x0a09080f0e0d0c0b, 0x0201000706050403, \ + 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) +#endif + +#if defined(__AVX512VL__) + #define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 ) +#else + #define mm256_shuflr64_16( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x09080f0e0d0c0b0a, 0x0100070605040302, \ + 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) +#endif + +#if defined(__AVX512VL__) + #define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 ) +#else + #define mm256_swap32_16( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x0d0c0f0e09080b0a, 0x0504070601000302, \ + 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) +#endif +#define mm256_shuflr32_16 mm256_swap32_16 +#define mm256_shufll32_16 mm256_swap32_16 + +#if defined(__AVX512VL__) + #define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 ) +#else + #define mm256_shuflr32_8( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x0c0f0e0d080b0a09, 0x0407060500030201, \ + 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) +#endif + // NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit // lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL + // AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if @@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \ } while(0) -// -// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified -// number of elements. Rotate is done in place, source arguments are -// overwritten. -// Some of these can use permute but appears to be slower. Maybe a Ryzen -// issue - -// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also -// makes these macros unnecessary. 
- -// continue using vror/vrol notation for now to avoid confusion with -// shufl2r/shufl2l macro functions available with AVX512. +// Swap 256 bit vectors in place. +// This should be avoided; it's more efficient to switch references. #define mm256_swap512_256( v1, v2 ) \ v1 = _mm256_xor_si256( v1, v2 ); \ v2 = _mm256_xor_si256( v1, v2 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 237af5f1..4c35df6b 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Bit rotations. // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit -// elements and can be called directly. But they only accept immediate 8 -// for control arg. -// The workaround is a fraud, just a fluke of the compiler's optimizer. -// It fails without -O3. The compiler seems to unroll shift loops, eliminating -// the variable control, better than rotate loops. +// elements and can be called directly. // // _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // -// For convenience and consistency with AVX2 +// For convenience and consistency with AVX2 macros. #define mm512_ror_64 _mm512_ror_epi64 #define mm512_rol_64 _mm512_rol_epi64 #define mm512_ror_32 _mm512_ror_epi32 #define mm512_rol_32 _mm512_rol_epi32 -static inline __m512i mm512_ror_var_64( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_srli_epi64( v, c ), - _mm512_slli_epi64( v, 64-c ) ); -} - -static inline __m512i mm512_rol_var_64( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_slli_epi64( v, c ), - _mm512_srli_epi64( v, 64-c ) ); -} - -static inline __m512i mm512_ror_var_32( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_srli_epi32( v, c ), - _mm512_slli_epi32( v, 32-c ) ); -} - -static inline __m512i mm512_rol_var_32( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_slli_epi32( v, c ), - _mm512_srli_epi32( v, 32-c ) ); -} - -static inline __m512i mm512_ror_16( __m512i const v, const int c ) -{ - return _mm512_or_si512( _mm512_srli_epi16( v, c ), - _mm512_slli_epi16( v, 16-c ) ); -} - -static inline __m512i mm512_rol_16( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_slli_epi16( v, c ), - _mm512_srli_epi16( v, 16-c ) ); -} - // Rotations using a vector control index are very slow due to overhead // to generate the index vector. Repeated rotations using the same index // are better handled by the calling function where the index only needs @@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) { return _mm512_alignr_epi8( v, v, c ); } -// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction -// but only with AVX512. Shuffle is just as fast and availble with AVX2 -// & SSE2. +// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512 since +// all can be done with ror & rol. Defined only for convenience and +// consistency with AVX2 & SSE2 macros. + #define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) #define mm512_shuflr64_32 mm512_swap64_32 #define mm512_shufll64_32 mm512_swap64_32 -// Need good way to distinguish 1 input shuffles, 2 input shuffle functions, -// and 2 input 2 output shuffle macros. -// -// shuflr is 1 input -// shufl2r is 2 input ... -// Drop macros? They can easilly be rebuilt using shufl2 functions +#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 ) +#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 ) + +#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 ) +#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 ) + +#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 ) +#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 ) + +#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 ) +#define mm512_shuflr32_16 mm512_swap32_16 +#define mm512_shufll32_16 mm512_swap32_16 + +#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 ) +#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 ) + // 2 input, 1 output -// Rotate concatenated { v1, v2 ) right or left and return v1. +// Concatenate ( v1, v2 ) then rotate right or left and return the high +// 512 bits, i.e. rotated v1. #define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 ) #define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
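To close, a standalone sanity check of the equivalence these headers depend on: rotating each 32 bit lane by 16 bits via byte shuffle gives bit-identical results to the shift-or rotate. This little program is not part of the patch; it assumes an x86-64 compiler with SSSE3 enabled (e.g. gcc -mssse3) and reuses the index vector from mm128_swap32_16:

   #include <stdio.h>
   #include <string.h>
   #include <immintrin.h>

   /* Generic shift-or rotate, same form as the SSE2 mm128_ror_32 fallback. */
   static __m128i ror32_shift( const __m128i v, const int c )
   {
      return _mm_or_si128( _mm_srli_epi32( v, c ),
                           _mm_slli_epi32( v, 32 - c ) );
   }

   /* Byte-shuffle rotate by 16, same index vector as mm128_swap32_16. */
   static __m128i ror32_16_shuffle( const __m128i v )
   {
      return _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0d0c0f0e09080b0a,
                                                  0x0504070601000302 ) );
   }

   int main(void)
   {
      const __m128i v = _mm_set_epi32( 0x33221100, 0x77665544,
                                       0xbbaa9988, 0xffeeddcc );
      __m128i a = ror32_shift( v, 16 );
      __m128i b = ror32_16_shuffle( v );
      puts( memcmp( &a, &b, sizeof a ) ? "mismatch" : "match" );  /* match */
      return 0;
   }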