From 58030e27886ff41612fe7676ea0ac75e815bbde9 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Mon, 1 Aug 2022 20:21:05 -0400 Subject: [PATCH] v3.20.2 --- Makefile.am | 2 - RELEASE_NOTES | 5 + algo-gate-api.c | 12 +- algo/blake/blake256-hash-4way.c | 79 +- algo/blake/blake2b-hash-4way.c | 6 +- algo/blake/blake2s-hash-4way.c | 8 +- algo/blake/blake512-hash-4way.c | 58 +- algo/blake/sph_blake2b.c | 55 +- algo/heavy/sph_hefty1.c | 382 -------- algo/heavy/sph_hefty1.h | 66 -- algo/lyra2/sponge.h | 12 +- algo/radiogatun/sph_radiogatun.c | 1003 --------------------- algo/radiogatun/sph_radiogatun.h | 186 ---- algo/x20/x20r-gate.c | 34 - algo/x20/x20r-gate.h | 58 -- algo/x20/x20r.c | 252 ------ algo/yescrypt/yescrypt-best.c | 5 - algo/yescrypt/yescrypt-platform.h | 213 ----- algo/yescrypt/yescrypt-simd.c | 1392 ----------------------------- algo/yescrypt/yescrypt.c | 488 ---------- algo/yescrypt/yescrypt.h | 382 -------- algo/yespower/yespower-gate.c | 8 +- configure | 20 +- configure.ac | 2 +- simd-utils/simd-128.h | 108 ++- simd-utils/simd-256.h | 132 +-- simd-utils/simd-512.h | 76 +- 27 files changed, 311 insertions(+), 4733 deletions(-) delete mode 100644 algo/heavy/sph_hefty1.c delete mode 100644 algo/heavy/sph_hefty1.h delete mode 100644 algo/radiogatun/sph_radiogatun.c delete mode 100644 algo/radiogatun/sph_radiogatun.h delete mode 100644 algo/x20/x20r-gate.c delete mode 100644 algo/x20/x20r-gate.h delete mode 100644 algo/x20/x20r.c delete mode 100644 algo/yescrypt/yescrypt-best.c delete mode 100644 algo/yescrypt/yescrypt-platform.h delete mode 100644 algo/yescrypt/yescrypt-simd.c delete mode 100644 algo/yescrypt/yescrypt.c delete mode 100644 algo/yescrypt/yescrypt.h diff --git a/Makefile.am b/Makefile.am index 82eeb6f6..b88b1e19 100644 --- a/Makefile.am +++ b/Makefile.am @@ -285,8 +285,6 @@ cpuminer_SOURCES = \ algo/x22/x22i-gate.c \ algo/x22/x25x.c \ algo/x22/x25x-4way.c \ - algo/yescrypt/yescrypt.c \ - algo/yescrypt/yescrypt-best.c \ algo/yespower/yespower-gate.c \ algo/yespower/yespower-blake2b.c \ algo/yespower/crypto/hmac-blake2b.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5e3c78b2..1f184898 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,11 @@ If not what makes it happen or not happen? Change Log ---------- +v3.20.2 + +Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2. +Removed old unused yescrypt library and other unused code. + v3.20.1 sph_blake2b optimized 1-way SSSE3 & AVX2. 
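The idea behind these bit rotation optimizations: when the rotation count is a multiple of 8 (or 16, or 32) bits, the rotate is a pure byte or element permutation, so a single shuffle instruction can replace the usual shift/shift/OR emulation. Below is a minimal sketch of the pattern using raw AVX2 intrinsics; the mm256_swap64_32 / mm256_shuflr64_24 / mm256_shuflr64_16 names in the hunks that follow are this tree's wrapper macros, and the standalone helpers here are illustrative stand-ins, not the repository's definitions.

#include <immintrin.h>

/* Generic rotate right of each 64-bit lane by c bits:
 * two shifts plus an OR, three instructions. */
static inline __m256i ror64_generic( const __m256i x, const int c )
{
    return _mm256_or_si256( _mm256_srli_epi64( x, c ),
                            _mm256_slli_epi64( x, 64 - c ) );
}

/* Rotate right by 32 == swap the 32-bit halves of each 64-bit lane:
 * a single 32-bit element shuffle. */
static inline __m256i swap64_32( const __m256i x )
{
    return _mm256_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

/* Rotate right by 24 == rotate each 64-bit lane right by 3 bytes:
 * a single byte shuffle (vpshufb). Output byte j takes input byte
 * (j + 3) mod 8 of the same lane. */
static inline __m256i shuflr64_24( const __m256i x )
{
    const __m256i m = _mm256_set_epi8( 10, 9, 8,15,14,13,12,11,
                                        2, 1, 0, 7, 6, 5, 4, 3,
                                       10, 9, 8,15,14,13,12,11,
                                        2, 1, 0, 7, 6, 5, 4, 3 );
    return _mm256_shuffle_epi8( x, m );
}

The same trick covers 32-bit lanes (rotate by 16 is a 16-bit element swap, rotate by 8 a byte shuffle) and the 128-bit mm128_* forms, where vpshufb requires SSSE3. One shuffle in place of the three-instruction chain inside the hot Blake G functions is presumably where the measured gain comes from.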
diff --git a/algo-gate-api.c b/algo-gate-api.c index f34f5ac0..dcbad060 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -371,15 +371,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_X22I: rc = register_x22i_algo ( gate ); break; case ALGO_X25X: rc = register_x25x_algo ( gate ); break; case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break; - case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break; -// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; - case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break; -// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break; + case ALGO_YESCRYPT: rc = register_yescrypt_algo ( gate ); break; + case ALGO_YESCRYPTR8: rc = register_yescryptr8_algo ( gate ); break; case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break; - case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break; -// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; - case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break; -// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break; + case ALGO_YESCRYPTR16: rc = register_yescryptr16_algo ( gate ); break; + case ALGO_YESCRYPTR32: rc = register_yescryptr32_algo ( gate ); break; case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break; case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break; case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break; diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index c55e0133..d3260677 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -400,18 +400,18 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, // Blake-256 4 way #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ +{ \ a = _mm_add_epi32( _mm_add_epi32( a, b ), \ _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ + d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ a = _mm_add_epi32( _mm_add_epi32( a, b ), \ _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ + d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ -} while (0) +} #if SPH_COMPACT_BLAKE_32 @@ -441,7 +441,8 @@ do { \ #else -#define ROUND_S_4WAY(r) do { \ +#define ROUND_S_4WAY(r) \ +{ \ GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ @@ -450,7 +451,7 @@ do { \ GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) +} #endif @@ -537,7 +538,7 @@ do { \ #if defined(__SSSE3__) -#define BLAKE256_4WAY_BLOCK_BSWAP32 do \ +#define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ); \ @@ -557,11 +558,11 @@ do { \ MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \ ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \ MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \ -} while(0) +} #else // SSE2 -#define BLAKE256_4WAY_BLOCK_BSWAP32 do \ +#define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ M0 = mm128_bswap_32( buf[0] ); \ M1 = 
mm128_bswap_32( buf[1] ); \ @@ -579,12 +580,12 @@ do { \ MD = mm128_bswap_32( buf[13] ); \ ME = mm128_bswap_32( buf[14] ); \ MF = mm128_bswap_32( buf[15] ); \ -} while(0) +} #endif // SSSE3 else SSE2 #define COMPRESS32_4WAY( rounds ) \ -do { \ +{ \ __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -631,7 +632,7 @@ do { \ H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \ H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \ H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \ -} while (0) +} #endif @@ -642,20 +643,21 @@ do { \ // Blake-256 8 way #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ +{ \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ + d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \ _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ + d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ -} while (0) +} -#define ROUND_S_8WAY(r) do { \ +#define ROUND_S_8WAY(r) \ +{ \ GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ @@ -664,7 +666,7 @@ do { \ GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) +} #define DECL_STATE32_8WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ @@ -699,7 +701,7 @@ do { \ } while (0) #define COMPRESS32_8WAY( rounds ) \ -do { \ +{ \ __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -764,10 +766,10 @@ do { \ H5 = mm256_xor3( VD, V5, H5 ); \ H6 = mm256_xor3( VE, V6, H6 ); \ H7 = mm256_xor3( VF, V7, H7 ); \ -} while (0) +} #define COMPRESS32_8WAY_LE( rounds ) \ -do { \ +{ \ __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -829,7 +831,7 @@ do { \ H5 = mm256_xor3( VD, V5, H5 ); \ H6 = mm256_xor3( VE, V6, H6 ); \ H7 = mm256_xor3( VF, V7, H7 ); \ -} while (0) +} void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, const void *data ) @@ -861,7 +863,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, // G1 V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ), _mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) ); - V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 ); + V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) ); V[ 9] = _mm256_add_epi32( V[ 9], V[13] ); V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 ); V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] ); @@ -881,7 +883,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, // G7 V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ), _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) ); - V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 ); + V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) ); V[ 3] = _mm256_add_epi32( V[ 3], _mm256_xor_si256( 
_mm256_set1_epi32( CSE ), M[15] ) ); } @@ -935,18 +937,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, // G1 V1 = _mm256_add_epi32( V1, _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) ); - VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 ); + VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) ); V9 = _mm256_add_epi32( V9, VD ); V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 ); // G4 V0 = _mm256_add_epi32( V0, V5 ); - VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 ); + VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi32( VA, VF ); V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 ); V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5, _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) ); - VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 ); + VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi32( VA, VF ); V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 ); @@ -954,12 +956,12 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); // G6 - VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 ); + VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) ); V8 = _mm256_add_epi32( V8, VD ); V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 ); V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ), _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) ); - VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 ); + VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) ); V8 = _mm256_add_epi32( V8, VD ); V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 ); @@ -967,7 +969,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, V9 = _mm256_add_epi32( V9, VE ); V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 ); V3 = _mm256_add_epi32( V3, V4 ); - VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 ); + VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) ); V9 = _mm256_add_epi32( V9, VE ); V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 ); @@ -1009,7 +1011,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, // Blake-256 16 way AVX512 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \ -do { \ +{ \ a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \ _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \ d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \ @@ -1020,9 +1022,10 @@ do { \ d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \ c = _mm512_add_epi32( c, d ); \ b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \ -} while (0) +} -#define ROUND_S_16WAY(r) do { \ +#define ROUND_S_16WAY(r) \ +{ \ GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ @@ -1031,7 +1034,7 @@ do { \ GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ -} while (0) +} #define DECL_STATE32_16WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ @@ -1066,7 +1069,7 @@ do { \ } while (0) #define COMPRESS32_16WAY( rounds ) \ -do { \ +{ \ __m512i M0, M1, M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ __m512i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -1133,10 +1136,10 @@ do { \ H5 = mm512_xor3( VD, V5, H5 ); \ H6 = mm512_xor3( VE, V6, H6 ); \ H7 = mm512_xor3( VF, V7, H7 ); \ -} while (0) +} #define COMPRESS32_16WAY_LE( rounds ) \ -do { \ +{ \ __m512i M0, M1, 
M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ __m512i V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -1198,7 +1201,7 @@ do { \ H5 = mm512_xor3( VD, V5, H5 ); \ H6 = mm512_xor3( VE, V6, H6 ); \ H7 = mm512_xor3( VF, V7, H7 ); \ -} while (0) +} // Blake-256 prehash of the second block is split onto 2 parts. The first part // is constant for every nonce and only needs to be run once per job. The diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c index d04601f3..6437c7ba 100644 --- a/algo/blake/blake2b-hash-4way.c +++ b/algo/blake/blake2b-hash-4way.c @@ -388,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ) #define B2B_G(a, b, c, d, x, y) \ { \ v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \ - v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \ + v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \ v[c] = _mm256_add_epi64( v[c], v[d] ); \ - v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \ + v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \ v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \ - v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \ + v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \ v[c] = _mm256_add_epi64( v[c], v[d] ); \ v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \ } diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index 190ad0b7..a69e5010 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -108,11 +108,11 @@ do { \ uint8_t s0 = sigma0; \ uint8_t s1 = sigma1; \ a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ + d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \ - d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ + d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ } while(0) @@ -320,11 +320,11 @@ do { \ uint8_t s0 = sigma0; \ uint8_t s1 = sigma1; \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \ + d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ + d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ } while(0) diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index 5c947e18..ebd5d958 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -314,10 +314,11 @@ static const sph_u64 CB[16] = { // Blake-512 8 way AVX512 -#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \ +#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \ +{ \ a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \ _mm512_set1_epi64( c1 ), m0 ), b ), a ); \ - d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \ + d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \ c = _mm512_add_epi64( c, d ); \ b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \ a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \ @@ -325,9 +326,10 @@ static const sph_u64 CB[16] = { d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \ c = _mm512_add_epi64( c, d ); \ b = 
mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \ -} while (0) +} -#define ROUND_B_8WAY(r) do { \ +#define ROUND_B_8WAY( r ) \ +{ \ GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ @@ -336,13 +338,13 @@ static const sph_u64 CB[16] = { GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) +} #define DECL_STATE64_8WAY \ __m512i H0, H1, H2, H3, H4, H5, H6, H7; \ uint64_t T0, T1; -#define COMPRESS64_8WAY( buf ) do \ +#define COMPRESS64_8WAY( buf ) \ { \ __m512i M0, M1, M2, M3, M4, M5, M6, M7; \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -409,7 +411,7 @@ static const sph_u64 CB[16] = { H5 = mm512_xor3( VD, V5, H5 ); \ H6 = mm512_xor3( VE, V6, H6 ); \ H7 = mm512_xor3( VF, V7, H7 ); \ -} while (0) +} void blake512_8way_compress( blake_8way_big_context *sc ) { @@ -610,7 +612,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate, V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( _mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); - VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 ); + VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) ); VA = _mm512_add_epi64( VA, VF ); V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 ); V0 = _mm512_add_epi64( V0, V5 ); @@ -714,7 +716,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, // V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 ); V1 = _mm512_add_epi64( V1, V5 ); - VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 ); + VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) ); V9 = _mm512_add_epi64( V9, VD ); V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 ); V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512( @@ -728,7 +730,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, // V2 = _mm512_add_epi64( V2, V6 ); V2 = _mm512_add_epi64( V2, _mm512_xor_si512( _mm512_set1_epi64( CBF ), M9 ) ); - VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 ); + VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) ); VA = _mm512_add_epi64( VA, VE ); V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 ); V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512( @@ -742,7 +744,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, // V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( // _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) ); - VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 ); + VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) ); VB = _mm512_add_epi64( VB, VF ); V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 ); V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( @@ -1054,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst) // Blake-512 4 way -#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \ +#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \ +{ \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ + d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); 
\ + d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \ -} while (0) +} -#define ROUND_B_4WAY(r) do { \ +#define ROUND_B_4WAY(r) \ +{ \ GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ @@ -1076,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst) GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) +} #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ uint64_t T0, T1; -#define COMPRESS64_4WAY do \ +#define COMPRESS64_4WAY \ { \ __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -1147,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst) H5 = mm256_xor3( VD, V5, H5 ); \ H6 = mm256_xor3( VE, V6, H6 ); \ H7 = mm256_xor3( VF, V7, H7 ); \ -} while (0) +} void blake512_4way_compress( blake_4way_big_context *sc ) @@ -1277,7 +1281,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate, // G4 skip nonce V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 ); - VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 ); + VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi64( VA, VF ); V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 ); V0 = _mm256_add_epi64( V0, V5 ); @@ -1364,7 +1368,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash, // finish round 0, with the nonce now available V0 = _mm256_add_epi64( V0, _mm256_xor_si256( _mm256_set1_epi64x( CB8 ), M9 ) ); - VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 ); + VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) ); VA = _mm256_add_epi64( VA, VF ); V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 ); @@ -1374,34 +1378,34 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash, // G1 V1 = _mm256_add_epi64( V1, V5 ); - VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 ); + VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) ); V9 = _mm256_add_epi64( V9, VD ); V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 ); V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256( _mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) ); - VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 ); + VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) ); V9 = _mm256_add_epi64( V9, VD ); V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 ); // G2 V2 = _mm256_add_epi64( V2, _mm256_xor_si256( _mm256_set1_epi64x( CBF ), M9 ) ); - VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 ); + VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) ); VA = _mm256_add_epi64( VA, VE ); V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 ); V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256( _mm256_set1_epi64x( CB9 ), MF ), V6 ) ); - VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 ); + VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) ); VA = _mm256_add_epi64( VA, VE ); V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 ); // G3 - VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 ); + VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) ); VB = _mm256_add_epi64( VB, VF ); V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 ); V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256( 
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) ); - VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 ); + VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) ); VB = _mm256_add_epi64( VB, VF ); V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 ); diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 9e13fe4f..159d6076 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -35,7 +35,6 @@ #include "sph_blake2b.h" // Little-endian byte access. - #define B2B_GET64(p) \ (((uint64_t) ((uint8_t *) (p))[0]) ^ \ (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ @@ -46,30 +45,34 @@ (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ (((uint64_t) ((uint8_t *) (p))[7]) << 56)) -// G Mixing function. - #if defined(__AVX2__) -#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \ +#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \ { \ V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \ - _mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \ - m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \ - V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \ + _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \ + m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ + V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \ + V[2] = _mm256_add_epi64( V[2], V[3] ); \ + V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \ +\ + V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \ + _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \ + m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ + V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \ V[2] = _mm256_add_epi64( V[2], V[3] ); \ - V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \ + V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \ } #define BLAKE2B_ROUND( R ) \ { \ __m256i *V = (__m256i*)v; \ - BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \ - BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \ + const uint8_t *sigmaR = sigma[R]; \ + BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \ V[3] = mm256_shufll_64( V[3] ); \ V[2] = mm256_swap_128( V[2] ); \ V[1] = mm256_shuflr_64( V[1] ); \ - BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \ - BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \ + BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \ V[3] = mm256_shuflr_64( V[3] ); \ V[2] = mm256_swap_128( V[2] ); \ V[1] = mm256_shufll_64( V[1] ); \ @@ -77,31 +80,34 @@ #elif defined(__SSSE3__) -#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \ +#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \ { \ Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ - _mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \ - Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \ + _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ + Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \ + Vc = _mm_add_epi64( Vc, Vd ); \ + Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \ +\ + Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ + _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ + Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \ Vc = _mm_add_epi64( Vc, Vd ); \ - Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \ + Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \ } #define BLAKE2B_ROUND( R ) \ { \ __m128i *V = (__m128i*)v; \ __m128i V2, V3, V6, V7; \ - BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \ - BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \ - BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \ - BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \ + const uint8_t *sigmaR = sigma[R]; \ + BLAKE2B_G( V[0], V[2], V[4], 
V[6], 0, 1, 2, 3 ); \ + BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ V2 = mm128_shufl2r_64( V[2], V[3] ); \ V3 = mm128_shufl2r_64( V[3], V[2] ); \ V6 = mm128_shufl2l_64( V[6], V[7] ); \ V7 = mm128_shufl2l_64( V[7], V[6] ); \ - BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \ - BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \ - BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \ - BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \ + BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ + BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ V[2] = mm128_shufl2l_64( V2, V3 ); \ V[3] = mm128_shufl2l_64( V3, V2 ); \ V[6] = mm128_shufl2r_64( V6, V7 ); \ @@ -120,6 +126,7 @@ Vd = ROTR64( Vd ^ Va, 32 ); \ Vc = Vc + Vd; \ Vb = ROTR64( Vb ^ Vc, 24 ); \ +\ Va = Va + Vb + m[ sigma[R][Sb] ]; \ Vd = ROTR64( Vd ^ Va, 16 ); \ Vc = Vc + Vd; \ diff --git a/algo/heavy/sph_hefty1.c b/algo/heavy/sph_hefty1.c deleted file mode 100644 index 8a8203cf..00000000 --- a/algo/heavy/sph_hefty1.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * HEFTY1 cryptographic hash function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. - */ - -#include <assert.h> -#include <string.h> - -#ifdef _MSC_VER -#define inline __inline -#endif - -#include "sph_hefty1.h" - -#define Min(A, B) (A <= B ? A : B) -#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \ - { \ - /* To thwart parallelism, Br modifies itself each time it's \ - * called. This also means that calling it in different \ - * orders yields different results. In C the order of \ - * evaluation of function arguments and + operands is \ - * unspecified (and depends on the compiler), so we must make \ - * the order of Br calls explicit. 
\ - */ \ - uint32_t brG = Br(ctx, G); \ - uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ - uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ - uint32_t brC = Br(ctx, C); \ - uint32_t brB = Br(ctx, B); \ - uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ - uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ - H = G; \ - G = F; \ - F = E; \ - E = D + Br(ctx, tmp2); \ - D = C; \ - C = B; \ - B = A; \ - A = tmp2 + tmp4; \ - } \ - -/* Nothing up my sleeve constants */ -const static uint32_t K[64] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -/* Initial hash values */ -const static uint32_t H[HEFTY1_STATE_WORDS] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL -}; - -static inline uint32_t Rr(uint32_t X, uint8_t n) -{ - return (X >> n) | (X << (32 - n)); -} - -static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) -{ - return (E & F) ^ (~E & G); -} - -static inline uint32_t Sigma1(uint32_t E) -{ - return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); -} - -static inline uint32_t sigma1(uint32_t X) -{ - return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10); -} - -static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) -{ - return (A & B) ^ (A & C) ^ (B & C); -} - -static inline uint32_t Sigma0(uint32_t A) -{ - return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); -} - -static inline uint32_t sigma0(uint32_t X) -{ - return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); -} - -static inline uint32_t Reverse32(uint32_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; - #else - return n; - #endif -} - -static inline uint64_t Reverse64(uint64_t n) -{ - #if BYTE_ORDER == LITTLE_ENDIAN - uint32_t a = n >> 32; - uint32_t b = (n << 32) >> 32; - - return (uint64_t)Reverse32(b) << 32 | Reverse32(a); - #else - return n; - #endif -} - -/* Smoosh byte into nibble */ -static inline uint8_t Smoosh4(uint8_t X) -{ - return (X >> 4) ^ (X & 0xf); -} - -/* Smoosh 32-bit word into 2-bits */ -static inline uint8_t Smoosh2(uint32_t X) -{ - uint16_t w = (X >> 16) ^ (X & 0xffff); - uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); - return (n >> 2) ^ (n & 0x3); -} - -static void Mangle(uint32_t *S) -{ - uint32_t *R = S; - uint32_t *C = &S[1]; - - uint8_t r0 = Smoosh4(R[0] >> 24); - uint8_t r1 = Smoosh4(R[0] >> 16); - uint8_t r2 = Smoosh4(R[0] >> 8); - uint8_t r3 = Smoosh4(R[0] & 0xff); - - int i; - - /* Diffuse */ - uint32_t tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) { - uint8_t r = Smoosh2(tmp); - switch (r) { - case 0: - C[i] ^= Rr(R[0], i + r0); - break; - case 1: - C[i] += Rr(~R[0], i + r1); - break; - case 2: - C[i] &= Rr(~R[0], i + r2); 
- break; - case 3: - C[i] ^= Rr(R[0], i + r3); - break; - } - tmp ^= C[i]; - } - - /* Compress */ - tmp = 0; - for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) - if (i % 2) - tmp ^= C[i]; - else - tmp += C[i]; - R[0] ^= tmp; -} - -static void Absorb(uint32_t *S, uint32_t X) -{ - uint32_t *R = S; - R[0] ^= X; - Mangle(S); -} - -static uint32_t Squeeze(uint32_t *S) -{ - uint32_t Y = S[0]; - Mangle(S); - return Y; -} - -/* Branch, compress and serialize function */ -static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) -{ - uint32_t R = Squeeze(ctx->sponge); - - uint8_t r0 = R >> 8; - uint8_t r1 = R & 0xff; - - uint32_t Y = 1 << (r0 % 32); - - switch (r1 % 4) - { - case 0: - /* Do nothing */ - break; - case 1: - return X & ~Y; - case 2: - return X | Y; - case 3: - return X ^ Y; - } - - return X; -} - -static void HashBlock(HEFTY1_CTX *ctx) -{ - uint32_t A, B, C, D, E, F, G, H; - uint32_t W[HEFTY1_BLOCK_BYTES]; - - assert(ctx); - - A = ctx->h[0]; - B = ctx->h[1]; - C = ctx->h[2]; - D = ctx->h[3]; - E = ctx->h[4]; - F = ctx->h[5]; - G = ctx->h[6]; - H = ctx->h[7]; - - int t = 0; - for (; t < 16; t++) { - W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ - Absorb(ctx->sponge, W[t] ^ K[t]); - } - - for (t = 0; t < 16; t++) { - Absorb(ctx->sponge, D ^ H); - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - for (t = 16; t < 64; t++) { - Absorb(ctx->sponge, H + D); - W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; - RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); - } - - ctx->h[0] += A; - ctx->h[1] += B; - ctx->h[2] += C; - ctx->h[3] += D; - ctx->h[4] += E; - ctx->h[5] += F; - ctx->h[6] += G; - ctx->h[7] += H; - - A = 0; - B = 0; - C = 0; - D = 0; - E = 0; - F = 0; - G = 0; - H = 0; - - memset(W, 0, sizeof(W)); -} - -/* Public interface */ - -void HEFTY1_Init(HEFTY1_CTX *ctx) -{ - assert(ctx); - - memcpy(ctx->h, H, sizeof(ctx->h)); - memset(ctx->block, 0, sizeof(ctx->block)); - ctx->written = 0; - memset(ctx->sponge, 0, sizeof(ctx->sponge)); -} - -void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) -{ - assert(ctx); - - uint64_t read = 0; - while (len) { - size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES); - size_t count = Min(len, HEFTY1_BLOCK_BYTES - end); - memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count); - len -= count; - read += count; - ctx->written += count; - if (!(ctx->written % HEFTY1_BLOCK_BYTES)) - HashBlock(ctx); - } -} - -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) -{ - assert(digest); - assert(ctx); - - /* Pad message (FIPS 180 Section 5.1.1) */ - size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES); - ctx->block[used++] = 0x80; /* Append 1 to end of message */ - if (used > HEFTY1_BLOCK_BYTES - 8) { - /* We have already written into the last 64bits, so - * we must continue into the next block. 
*/ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used); - HashBlock(ctx); - used = 0; /* Create a new block (below) */ - } - - /* All remaining bits to zero */ - memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used); - - /* The last 64bits encode the length (in network byte order) */ - uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; - *len = Reverse64(ctx->written*8); - - HashBlock(ctx); - - /* Convert back to network byte order */ - int i = 0; - for (; i < HEFTY1_STATE_WORDS; i++) - ctx->h[i] = Reverse32(ctx->h[i]); - - memcpy(digest, ctx->h, sizeof(ctx->h)); - memset(ctx, 0, sizeof(HEFTY1_CTX)); -} - -unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) -{ - HEFTY1_CTX ctx; - static unsigned char m[HEFTY1_DIGEST_BYTES]; - - if (!digest) - digest = m; - - HEFTY1_Init(&ctx); - HEFTY1_Update(&ctx, buf, len); - HEFTY1_Final(digest, &ctx); - - return digest; -} \ No newline at end of file diff --git a/algo/heavy/sph_hefty1.h b/algo/heavy/sph_hefty1.h deleted file mode 100644 index afcd274f..00000000 --- a/algo/heavy/sph_hefty1.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * HEFTY1 cryptographic hash function - * - * Copyright (c) 2014, dbcc14 - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those - * of the authors and should not be interpreted as representing official policies, - * either expressed or implied, of the FreeBSD Project. 
- */ - -#ifndef __HEFTY1_H__ -#define __HEFTY1_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef WIN32 -#include <sys/types.h> -#endif - -#include <stdint.h> - -#define HEFTY1_DIGEST_BYTES 32 -#define HEFTY1_BLOCK_BYTES 64 -#define HEFTY1_STATE_WORDS 8 -#define HEFTY1_SPONGE_WORDS 4 - -typedef struct HEFTY1_CTX { - uint32_t h[HEFTY1_STATE_WORDS]; - uint8_t block[HEFTY1_BLOCK_BYTES]; - uint64_t written; - uint32_t sponge[HEFTY1_SPONGE_WORDS]; -} HEFTY1_CTX; - -void HEFTY1_Init(HEFTY1_CTX *cxt); -void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len); -void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt); -unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest); - -#ifdef __cplusplus -} -#endif - -#endif /* __HEFTY1_H__ */ \ No newline at end of file diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 2385640e..34df0cc0 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -97,11 +97,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ // returns void, updates all args #define G_4X64(a,b,c,d) \ a = _mm256_add_epi64( a, b ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ + d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ - b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \ + b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \ a = _mm256_add_epi64( a, b ); \ - d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \ + d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 ); @@ -137,11 +137,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ // returns void, all args updated #define G_2X64(a,b,c,d) \ a = _mm_add_epi64( a, b ); \ - d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \ + d = mm128_swap64_32( _mm_xor_si128( d, a) ); \ c = _mm_add_epi64( c, d ); \ - b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \ + b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \ a = _mm_add_epi64( a, b ); \ - d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \ + d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \ c = _mm_add_epi64( c, d ); \ b = mm128_ror_64( _mm_xor_si128( b, c ), 63 ); diff --git a/algo/radiogatun/sph_radiogatun.c b/algo/radiogatun/sph_radiogatun.c deleted file mode 100644 index 888b028f..00000000 --- a/algo/radiogatun/sph_radiogatun.c +++ /dev/null @@ -1,1003 +0,0 @@ -/* $Id: radiogatun.c 226 2010-06-16 17:28:08Z tp $ */ -/* - * RadioGatun implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include <stddef.h> -#include <string.h> - -#include "sph_radiogatun.h" - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_RADIOGATUN -#define SPH_SMALL_FOOTPRINT_RADIOGATUN 1 -#endif - -/* ======================================================================= */ -/* - * The core macros. We want to unroll 13 successive rounds so that the - * belt rotation becomes pure routing, solved at compilation time, with - * no unnecessary copying. We also wish all state variables to be - * independent local variables, so that the C compiler becomes free to - * map these on registers as it sees fit. This requires some heavy - * preprocessor trickery, including a full addition macro modulo 13. - * - * These macros are size-independent. Some macros must be defined before - * use: - * WT evaluates to the type for a word (32-bit or 64-bit) - * T truncates a value to the proper word size - * ROR(x, n) right rotation of a word x, with explicit modular - * reduction of the rotation count n by the word size - * INW(i, j) input word j (0, 1, or 2) of block i (0 to 12) - * - * For INW, the input buffer is pointed to by "buf" which has type - * "const unsigned char *". - */ - -#define MUL19(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - action(12); \ - action(13); \ - action(14); \ - action(15); \ - action(16); \ - action(17); \ - action(18); \ - } while (0) - -#define DECL19(b) b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ - b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ - b ## 12, b ## 13, b ## 14, b ## 15, b ## 16, \ - b ## 17, b ## 18 - -#define M19_T7(i) M19_T7_(i) -#define M19_T7_(i) M19_T7_ ## i -#define M19_T7_0 0 -#define M19_T7_1 7 -#define M19_T7_2 14 -#define M19_T7_3 2 -#define M19_T7_4 9 -#define M19_T7_5 16 -#define M19_T7_6 4 -#define M19_T7_7 11 -#define M19_T7_8 18 -#define M19_T7_9 6 -#define M19_T7_10 13 -#define M19_T7_11 1 -#define M19_T7_12 8 -#define M19_T7_13 15 -#define M19_T7_14 3 -#define M19_T7_15 10 -#define M19_T7_16 17 -#define M19_T7_17 5 -#define M19_T7_18 12 - -#define M19_A1(i) M19_A1_(i) -#define M19_A1_(i) M19_A1_ ## i -#define M19_A1_0 1 -#define M19_A1_1 2 -#define M19_A1_2 3 -#define M19_A1_3 4 -#define M19_A1_4 5 -#define M19_A1_5 6 -#define M19_A1_6 7 -#define M19_A1_7 8 -#define M19_A1_8 9 -#define M19_A1_9 10 -#define M19_A1_10 11 -#define M19_A1_11 12 -#define M19_A1_12 13 -#define M19_A1_13 14 -#define M19_A1_14 15 -#define M19_A1_15 16 -#define M19_A1_16 17 -#define M19_A1_17 18 -#define M19_A1_18 0 - -#define M19_A2(i) M19_A2_(i) -#define M19_A2_(i) M19_A2_ ## i -#define M19_A2_0 2 -#define M19_A2_1 3 -#define M19_A2_2 4 -#define M19_A2_3 5 -#define M19_A2_4 6 -#define M19_A2_5 7 -#define M19_A2_6 8 -#define M19_A2_7 9 -#define M19_A2_8 10 -#define M19_A2_9 11 -#define M19_A2_10 12 -#define M19_A2_11 13 -#define M19_A2_12 14 -#define M19_A2_13 15 -#define M19_A2_14 16 -#define M19_A2_15 17 -#define M19_A2_16 18 -#define M19_A2_17 0 -#define M19_A2_18 1 - -#define M19_A4(i) M19_A4_(i) -#define M19_A4_(i) M19_A4_ ## i -#define M19_A4_0 
4 -#define M19_A4_1 5 -#define M19_A4_2 6 -#define M19_A4_3 7 -#define M19_A4_4 8 -#define M19_A4_5 9 -#define M19_A4_6 10 -#define M19_A4_7 11 -#define M19_A4_8 12 -#define M19_A4_9 13 -#define M19_A4_10 14 -#define M19_A4_11 15 -#define M19_A4_12 16 -#define M19_A4_13 17 -#define M19_A4_14 18 -#define M19_A4_15 0 -#define M19_A4_16 1 -#define M19_A4_17 2 -#define M19_A4_18 3 - -#define ACC_a(i) ACC_a_(i) -#define ACC_a_(i) a ## i -#define ACC_atmp(i) ACC_atmp_(i) -#define ACC_atmp_(i) atmp ## i - -#define MILL1(i) (atmp ## i = a ## i ^ T(ACC_a(M19_A1(i)) \ - | ~ACC_a(M19_A2(i)))) -#define MILL2(i) (a ## i = ROR(ACC_atmp(M19_T7(i)), ((i * (i + 1)) >> 1))) -#define MILL3(i) (atmp ## i = a ## i ^ ACC_a(M19_A1(i)) ^ ACC_a(M19_A4(i))) -#define MILL4(i) (a ## i = atmp ## i ^ (i == 0)) - -#define MILL do { \ - WT DECL19(atmp); \ - MUL19(MILL1); \ - MUL19(MILL2); \ - MUL19(MILL3); \ - MUL19(MILL4); \ - } while (0) - -#define DECL13(b) b ## 0 ## _0, b ## 0 ## _1, b ## 0 ## _2, \ - b ## 1 ## _0, b ## 1 ## _1, b ## 1 ## _2, \ - b ## 2 ## _0, b ## 2 ## _1, b ## 2 ## _2, \ - b ## 3 ## _0, b ## 3 ## _1, b ## 3 ## _2, \ - b ## 4 ## _0, b ## 4 ## _1, b ## 4 ## _2, \ - b ## 5 ## _0, b ## 5 ## _1, b ## 5 ## _2, \ - b ## 6 ## _0, b ## 6 ## _1, b ## 6 ## _2, \ - b ## 7 ## _0, b ## 7 ## _1, b ## 7 ## _2, \ - b ## 8 ## _0, b ## 8 ## _1, b ## 8 ## _2, \ - b ## 9 ## _0, b ## 9 ## _1, b ## 9 ## _2, \ - b ## 10 ## _0, b ## 10 ## _1, b ## 10 ## _2, \ - b ## 11 ## _0, b ## 11 ## _1, b ## 11 ## _2, \ - b ## 12 ## _0, b ## 12 ## _1, b ## 12 ## _2 - -#define M13_A(i, j) M13_A_(i, j) -#define M13_A_(i, j) M13_A_ ## i ## _ ## j -#define M13_A_0_0 0 -#define M13_A_0_1 1 -#define M13_A_0_2 2 -#define M13_A_0_3 3 -#define M13_A_0_4 4 -#define M13_A_0_5 5 -#define M13_A_0_6 6 -#define M13_A_0_7 7 -#define M13_A_0_8 8 -#define M13_A_0_9 9 -#define M13_A_0_10 10 -#define M13_A_0_11 11 -#define M13_A_0_12 12 -#define M13_A_1_0 1 -#define M13_A_1_1 2 -#define M13_A_1_2 3 -#define M13_A_1_3 4 -#define M13_A_1_4 5 -#define M13_A_1_5 6 -#define M13_A_1_6 7 -#define M13_A_1_7 8 -#define M13_A_1_8 9 -#define M13_A_1_9 10 -#define M13_A_1_10 11 -#define M13_A_1_11 12 -#define M13_A_1_12 0 -#define M13_A_2_0 2 -#define M13_A_2_1 3 -#define M13_A_2_2 4 -#define M13_A_2_3 5 -#define M13_A_2_4 6 -#define M13_A_2_5 7 -#define M13_A_2_6 8 -#define M13_A_2_7 9 -#define M13_A_2_8 10 -#define M13_A_2_9 11 -#define M13_A_2_10 12 -#define M13_A_2_11 0 -#define M13_A_2_12 1 -#define M13_A_3_0 3 -#define M13_A_3_1 4 -#define M13_A_3_2 5 -#define M13_A_3_3 6 -#define M13_A_3_4 7 -#define M13_A_3_5 8 -#define M13_A_3_6 9 -#define M13_A_3_7 10 -#define M13_A_3_8 11 -#define M13_A_3_9 12 -#define M13_A_3_10 0 -#define M13_A_3_11 1 -#define M13_A_3_12 2 -#define M13_A_4_0 4 -#define M13_A_4_1 5 -#define M13_A_4_2 6 -#define M13_A_4_3 7 -#define M13_A_4_4 8 -#define M13_A_4_5 9 -#define M13_A_4_6 10 -#define M13_A_4_7 11 -#define M13_A_4_8 12 -#define M13_A_4_9 0 -#define M13_A_4_10 1 -#define M13_A_4_11 2 -#define M13_A_4_12 3 -#define M13_A_5_0 5 -#define M13_A_5_1 6 -#define M13_A_5_2 7 -#define M13_A_5_3 8 -#define M13_A_5_4 9 -#define M13_A_5_5 10 -#define M13_A_5_6 11 -#define M13_A_5_7 12 -#define M13_A_5_8 0 -#define M13_A_5_9 1 -#define M13_A_5_10 2 -#define M13_A_5_11 3 -#define M13_A_5_12 4 -#define M13_A_6_0 6 -#define M13_A_6_1 7 -#define M13_A_6_2 8 -#define M13_A_6_3 9 -#define M13_A_6_4 10 -#define M13_A_6_5 11 -#define M13_A_6_6 12 -#define M13_A_6_7 0 -#define M13_A_6_8 1 -#define M13_A_6_9 2 -#define M13_A_6_10 3 -#define M13_A_6_11 
4 -#define M13_A_6_12 5 -#define M13_A_7_0 7 -#define M13_A_7_1 8 -#define M13_A_7_2 9 -#define M13_A_7_3 10 -#define M13_A_7_4 11 -#define M13_A_7_5 12 -#define M13_A_7_6 0 -#define M13_A_7_7 1 -#define M13_A_7_8 2 -#define M13_A_7_9 3 -#define M13_A_7_10 4 -#define M13_A_7_11 5 -#define M13_A_7_12 6 -#define M13_A_8_0 8 -#define M13_A_8_1 9 -#define M13_A_8_2 10 -#define M13_A_8_3 11 -#define M13_A_8_4 12 -#define M13_A_8_5 0 -#define M13_A_8_6 1 -#define M13_A_8_7 2 -#define M13_A_8_8 3 -#define M13_A_8_9 4 -#define M13_A_8_10 5 -#define M13_A_8_11 6 -#define M13_A_8_12 7 -#define M13_A_9_0 9 -#define M13_A_9_1 10 -#define M13_A_9_2 11 -#define M13_A_9_3 12 -#define M13_A_9_4 0 -#define M13_A_9_5 1 -#define M13_A_9_6 2 -#define M13_A_9_7 3 -#define M13_A_9_8 4 -#define M13_A_9_9 5 -#define M13_A_9_10 6 -#define M13_A_9_11 7 -#define M13_A_9_12 8 -#define M13_A_10_0 10 -#define M13_A_10_1 11 -#define M13_A_10_2 12 -#define M13_A_10_3 0 -#define M13_A_10_4 1 -#define M13_A_10_5 2 -#define M13_A_10_6 3 -#define M13_A_10_7 4 -#define M13_A_10_8 5 -#define M13_A_10_9 6 -#define M13_A_10_10 7 -#define M13_A_10_11 8 -#define M13_A_10_12 9 -#define M13_A_11_0 11 -#define M13_A_11_1 12 -#define M13_A_11_2 0 -#define M13_A_11_3 1 -#define M13_A_11_4 2 -#define M13_A_11_5 3 -#define M13_A_11_6 4 -#define M13_A_11_7 5 -#define M13_A_11_8 6 -#define M13_A_11_9 7 -#define M13_A_11_10 8 -#define M13_A_11_11 9 -#define M13_A_11_12 10 -#define M13_A_12_0 12 -#define M13_A_12_1 0 -#define M13_A_12_2 1 -#define M13_A_12_3 2 -#define M13_A_12_4 3 -#define M13_A_12_5 4 -#define M13_A_12_6 5 -#define M13_A_12_7 6 -#define M13_A_12_8 7 -#define M13_A_12_9 8 -#define M13_A_12_10 9 -#define M13_A_12_11 10 -#define M13_A_12_12 11 - -#define M13_N(i) M13_N_(i) -#define M13_N_(i) M13_N_ ## i -#define M13_N_0 12 -#define M13_N_1 11 -#define M13_N_2 10 -#define M13_N_3 9 -#define M13_N_4 8 -#define M13_N_5 7 -#define M13_N_6 6 -#define M13_N_7 5 -#define M13_N_8 4 -#define M13_N_9 3 -#define M13_N_10 2 -#define M13_N_11 1 -#define M13_N_12 0 - -#define ACC_b(i, k) ACC_b_(i, k) -#define ACC_b_(i, k) b ## i ## _ ## k - -#define ROUND_ELT(k, s) do { \ - if ((bj += 3) == 39) \ - bj = 0; \ - sc->b[bj + s] ^= a ## k; \ - } while (0) - -#define ROUND_SF(j) do { \ - size_t bj = (j) * 3; \ - ROUND_ELT(1, 0); \ - ROUND_ELT(2, 1); \ - ROUND_ELT(3, 2); \ - ROUND_ELT(4, 0); \ - ROUND_ELT(5, 1); \ - ROUND_ELT(6, 2); \ - ROUND_ELT(7, 0); \ - ROUND_ELT(8, 1); \ - ROUND_ELT(9, 2); \ - ROUND_ELT(10, 0); \ - ROUND_ELT(11, 1); \ - ROUND_ELT(12, 2); \ - MILL; \ - bj = (j) * 3; \ - a ## 13 ^= sc->b[bj + 0]; \ - a ## 14 ^= sc->b[bj + 1]; \ - a ## 15 ^= sc->b[bj + 2]; \ - } while (0) - -#define INPUT_SF(j, p0, p1, p2) do { \ - size_t bj = ((j) + 1) * 3; \ - if (bj == 39) \ - bj = 0; \ - sc->b[bj + 0] ^= (p0); \ - sc->b[bj + 1] ^= (p1); \ - sc->b[bj + 2] ^= (p2); \ - a16 ^= (p0); \ - a17 ^= (p1); \ - a18 ^= (p2); \ - } while (0) - - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define ROUND ROUND_SF -#define INPUT INPUT_SF - -#else - -/* - * Round function R, on base j. The value j is such that B[0] is actually - * b[j] after the initial rotation. On the 13-round macro, j has the - * successive values 12, 11, 10... 1, 0. 
- */ -#define ROUND(j) do { \ - ACC_b(M13_A(1, j), 0) ^= a ## 1; \ - ACC_b(M13_A(2, j), 1) ^= a ## 2; \ - ACC_b(M13_A(3, j), 2) ^= a ## 3; \ - ACC_b(M13_A(4, j), 0) ^= a ## 4; \ - ACC_b(M13_A(5, j), 1) ^= a ## 5; \ - ACC_b(M13_A(6, j), 2) ^= a ## 6; \ - ACC_b(M13_A(7, j), 0) ^= a ## 7; \ - ACC_b(M13_A(8, j), 1) ^= a ## 8; \ - ACC_b(M13_A(9, j), 2) ^= a ## 9; \ - ACC_b(M13_A(10, j), 0) ^= a ## 10; \ - ACC_b(M13_A(11, j), 1) ^= a ## 11; \ - ACC_b(M13_A(12, j), 2) ^= a ## 12; \ - MILL; \ - a ## 13 ^= ACC_b(j, 0); \ - a ## 14 ^= ACC_b(j, 1); \ - a ## 15 ^= ACC_b(j, 2); \ - } while (0) - -#define INPUT(j, p0, p1, p2) do { \ - ACC_b(M13_A(1, j), 0) ^= (p0); \ - ACC_b(M13_A(1, j), 1) ^= (p1); \ - ACC_b(M13_A(1, j), 2) ^= (p2); \ - a16 ^= (p0); \ - a17 ^= (p1); \ - a18 ^= (p2); \ - } while (0) - -#endif - -#define MUL13(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - action(12); \ - } while (0) - -#define MILL_READ_ELT(i) do { \ - a ## i = sc->a[i]; \ - } while (0) - -#define MILL_WRITE_ELT(i) do { \ - sc->a[i] = a ## i; \ - } while (0) - -#define STATE_READ_SF do { \ - MUL19(MILL_READ_ELT); \ - } while (0) - -#define STATE_WRITE_SF do { \ - MUL19(MILL_WRITE_ELT); \ - } while (0) - -#define PUSH13_SF do { \ - WT DECL19(a); \ - const unsigned char *buf; \ - \ - buf = data; \ - STATE_READ_SF; \ - while (len >= sizeof sc->data) { \ - size_t mk; \ - for (mk = 13; mk > 0; mk --) { \ - WT p0 = INW(0, 0); \ - WT p1 = INW(0, 1); \ - WT p2 = INW(0, 2); \ - INPUT_SF(mk - 1, p0, p1, p2); \ - ROUND_SF(mk - 1); \ - buf += (sizeof sc->data) / 13; \ - len -= (sizeof sc->data) / 13; \ - } \ - } \ - STATE_WRITE_SF; \ - return len; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define STATE_READ STATE_READ_SF -#define STATE_WRITE STATE_WRITE_SF -#define PUSH13 PUSH13_SF - -#else - -#define BELT_READ_ELT(i) do { \ - b ## i ## _0 = sc->b[3 * i + 0]; \ - b ## i ## _1 = sc->b[3 * i + 1]; \ - b ## i ## _2 = sc->b[3 * i + 2]; \ - } while (0) - -#define BELT_WRITE_ELT(i) do { \ - sc->b[3 * i + 0] = b ## i ## _0; \ - sc->b[3 * i + 1] = b ## i ## _1; \ - sc->b[3 * i + 2] = b ## i ## _2; \ - } while (0) - -#define STATE_READ do { \ - MUL13(BELT_READ_ELT); \ - MUL19(MILL_READ_ELT); \ - } while (0) - -#define STATE_WRITE do { \ - MUL13(BELT_WRITE_ELT); \ - MUL19(MILL_WRITE_ELT); \ - } while (0) - -/* - * Input data by chunks of 13*3 blocks. This is the body of the - * radiogatun32_push13() and radiogatun64_push13() functions. 
- */ -#define PUSH13 do { \ - WT DECL19(a), DECL13(b); \ - const unsigned char *buf; \ - \ - buf = data; \ - STATE_READ; \ - while (len >= sizeof sc->data) { \ - WT p0, p1, p2; \ - MUL13(PUSH13_ELT); \ - buf += sizeof sc->data; \ - len -= sizeof sc->data; \ - } \ - STATE_WRITE; \ - return len; \ - } while (0) - -#define PUSH13_ELT(k) do { \ - p0 = INW(k, 0); \ - p1 = INW(k, 1); \ - p2 = INW(k, 2); \ - INPUT(M13_N(k), p0, p1, p2); \ - ROUND(M13_N(k)); \ - } while (0) - -#endif - -#define BLANK13_SF do { \ - size_t mk = 13; \ - while (mk -- > 0) \ - ROUND_SF(mk); \ - } while (0) - -#define BLANK1_SF do { \ - WT tmp0, tmp1, tmp2; \ - ROUND_SF(12); \ - tmp0 = sc->b[36]; \ - tmp1 = sc->b[37]; \ - tmp2 = sc->b[38]; \ - memmove(sc->b + 3, sc->b, 36 * sizeof sc->b[0]); \ - sc->b[0] = tmp0; \ - sc->b[1] = tmp1; \ - sc->b[2] = tmp2; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN - -#define BLANK13 BLANK13_SF -#define BLANK1 BLANK1_SF - -#else - -/* - * Run 13 blank rounds. This macro expects the "a" and "b" state variables - * to be already declared. - */ -#define BLANK13 MUL13(BLANK13_ELT) - -#define BLANK13_ELT(k) ROUND(M13_N(k)) - -#define MUL12(action) do { \ - action(0); \ - action(1); \ - action(2); \ - action(3); \ - action(4); \ - action(5); \ - action(6); \ - action(7); \ - action(8); \ - action(9); \ - action(10); \ - action(11); \ - } while (0) - -/* - * Run a single blank round, and physically rotate the belt. This is used - * for the last blank rounds, and the output rounds. This macro expects the - * "a" and "b" state variables to be already declared. - */ -#define BLANK1 do { \ - WT tmp0, tmp1, tmp2; \ - ROUND(12); \ - tmp0 = b0_0; \ - tmp1 = b0_1; \ - tmp2 = b0_2; \ - MUL12(BLANK1_ELT); \ - b1_0 = tmp0; \ - b1_1 = tmp1; \ - b1_2 = tmp2; \ - } while (0) - -#define BLANK1_ELT(i) do { \ - ACC_b(M13_A(M13_N(i), 1), 0) = ACC_b(M13_N(i), 0); \ - ACC_b(M13_A(M13_N(i), 1), 1) = ACC_b(M13_N(i), 1); \ - ACC_b(M13_A(M13_N(i), 1), 2) = ACC_b(M13_N(i), 2); \ - } while (0) - -#endif - -#define NO_TOKEN - -/* - * Perform padding, then blank rounds, then output some words. This is - * the body of sph_radiogatun32_close() and sph_radiogatun64_close(). - */ -#define CLOSE_SF(width) CLOSE_GEN(width, \ - NO_TOKEN, STATE_READ_SF, BLANK1_SF, BLANK13_SF) - -#if SPH_SMALL_FOOTPRINT_RADIOGATUN -#define CLOSE CLOSE_SF -#else -#define CLOSE(width) CLOSE_GEN(width, \ - WT DECL13(b);, STATE_READ, BLANK1, BLANK13) -#endif - -#define CLOSE_GEN(width, WTb13, state_read, blank1, blank13) do { \ - unsigned ptr, num; \ - unsigned char *out; \ - WT DECL19(a); \ - WTb13 \ - \ - ptr = sc->data_ptr; \ - sc->data[ptr ++] = 0x01; \ - memset(sc->data + ptr, 0, (sizeof sc->data) - ptr); \ - radiogatun ## width ## _push13(sc, sc->data, sizeof sc->data); \ - \ - num = 17; \ - for (;;) { \ - ptr += 3 * (width >> 3); \ - if (ptr > sizeof sc->data) \ - break; \ - num --; \ - } \ - \ - state_read; \ - if (num >= 13) { \ - blank13; \ - num -= 13; \ - } \ - while (num -- > 0) \ - blank1; \ - \ - num = 0; \ - out = dst; \ - for (;;) { \ - OUTW(out, a1); \ - out += width >> 3; \ - OUTW(out, a2); \ - out += width >> 3; \ - num += 2 * (width >> 3); \ - if (num >= 32) \ - break; \ - blank1; \ - } \ - INIT; \ - } while (0) - -/* - * Initialize context structure. 
- */ -#if SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN - -#define INIT do { \ - memset(sc->a, 0, sizeof sc->a); \ - memset(sc->b, 0, sizeof sc->b); \ - sc->data_ptr = 0; \ - } while (0) - -#else - -#define INIT do { \ - size_t u; \ - for (u = 0; u < 19; u ++) \ - sc->a[u] = 0; \ - for (u = 0; u < 39; u ++) \ - sc->b[u] = 0; \ - sc->data_ptr = 0; \ - } while (0) - -#endif - -/* ======================================================================= */ -/* - * RadioGatun[32]. - */ - -#if !SPH_NO_RG32 - -#undef WT -#define WT sph_u32 -#undef T -#define T SPH_T32 -#undef ROR -#define ROR(x, n) SPH_T32(((x) << ((32 - (n)) & 31)) | ((x) >> ((n) & 31))) -#undef INW -#define INW(i, j) sph_dec32le_aligned(buf + (4 * (3 * (i) + (j)))) -#undef OUTW -#define OUTW(b, v) sph_enc32le(b, v) - -/* - * Insert data by big chunks of 13*12 = 156 bytes. Returned value is the - * number of remaining bytes (between 0 and 155). This method assumes that - * the input data is suitably aligned. - */ -static size_t -radiogatun32_push13(sph_radiogatun32_context *sc, const void *data, size_t len) -{ - PUSH13; -} - -/* see sph_radiogatun.h */ -void -sph_radiogatun32_init(void *cc) -{ - sph_radiogatun32_context *sc; - - sc = cc; - INIT; -} - -#ifdef SPH_UPTR -static void -radiogatun32_short(void *cc, const void *data, size_t len) -#else -/* see sph_radiogatun.h */ -void -sph_radiogatun32(void *cc, const void *data, size_t len) -#endif -{ - sph_radiogatun32_context *sc; - unsigned ptr; - - sc = cc; - ptr = sc->data_ptr; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->data) - ptr; - if (clen > len) - clen = len; - memcpy(sc->data + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->data) { - radiogatun32_push13(sc, sc->data, sizeof sc->data); - ptr = 0; - } - } - sc->data_ptr = ptr; -} - -#ifdef SPH_UPTR -/* see sph_radiogatun.h */ -void -sph_radiogatun32(void *cc, const void *data, size_t len) -{ - sph_radiogatun32_context *sc; - unsigned ptr; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - radiogatun32_short(cc, data, len); - return; - } - sc = cc; - ptr = sc->data_ptr; - if (ptr > 0) { - unsigned t; - - t = (sizeof sc->data) - ptr; - radiogatun32_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 3) != 0) { - radiogatun32_short(sc, data, len); - return; - } -#endif - rlen = radiogatun32_push13(sc, data, len); - memcpy(sc->data, (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_radiogatun.h */ -void -sph_radiogatun32_close(void *cc, void *dst) -{ - sph_radiogatun32_context *sc; - - sc = cc; - CLOSE(32); -} - -#endif - -/* ======================================================================= */ -/* - * RadioGatun[64]. Compiled only if a 64-bit or more type is available. - */ - -#if SPH_64 - -#if !SPH_NO_RG64 - -#undef WT -#define WT sph_u64 -#undef T -#define T SPH_T64 -#undef ROR -#define ROR(x, n) SPH_T64(((x) << ((64 - (n)) & 63)) | ((x) >> ((n) & 63))) -#undef INW -#define INW(i, j) sph_dec64le_aligned(buf + (8 * (3 * (i) + (j)))) -#undef OUTW -#define OUTW(b, v) sph_enc64le(b, v) - -/* - * On 32-bit x86, register pressure is such that using the small - * footprint version is a net gain (x2 speed), because that variant - * uses fewer local variables. 
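Stepping back to the ROR macro defined at the top of this RadioGatun[32] section: it is a branch-free 32-bit right rotation with both shift counts masked, so a rotation by 0 cannot produce an undefined shift by 32. An equivalent standalone form, a sketch assuming only <stdint.h>:

    #include <stdint.h>

    /* Rotate x right by n bits; the & 31 masks keep n == 0 well-defined,
       matching the ROR macro above. */
    static uint32_t ror32(uint32_t x, unsigned n)
    {
        return (x << ((32 - n) & 31)) | (x >> (n & 31));
    }

For example, ror32(0x80000001, 1) == 0xC0000000.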
- */ -#if SPH_I386_MSVC || SPH_I386_GCC || defined __i386__ -#undef PUSH13 -#define PUSH13 PUSH13_SF -#undef CLOSE -#define CLOSE CLOSE_SF -#endif - -/* - * Insert data by big chunks of 13*24 = 312 bytes. Returned value is the - * number of remaining bytes (between 0 and 311). This method assumes that - * the input data is suitably aligned. - */ -static size_t -radiogatun64_push13(sph_radiogatun64_context *sc, const void *data, size_t len) -{ - PUSH13; -} - -/* see sph_radiogatun.h */ -void -sph_radiogatun64_init(void *cc) -{ - sph_radiogatun64_context *sc; - - sc = cc; - INIT; -} - -#ifdef SPH_UPTR -static void -radiogatun64_short(void *cc, const void *data, size_t len) -#else -/* see sph_radiogatun.h */ -void -sph_radiogatun64(void *cc, const void *data, size_t len) -#endif -{ - sph_radiogatun64_context *sc; - unsigned ptr; - - sc = cc; - ptr = sc->data_ptr; - while (len > 0) { - size_t clen; - - clen = (sizeof sc->data) - ptr; - if (clen > len) - clen = len; - memcpy(sc->data + ptr, data, clen); - data = (const unsigned char *)data + clen; - len -= clen; - ptr += clen; - if (ptr == sizeof sc->data) { - radiogatun64_push13(sc, sc->data, sizeof sc->data); - ptr = 0; - } - } - sc->data_ptr = ptr; -} - -#ifdef SPH_UPTR -/* see sph_radiogatun.h */ -void -sph_radiogatun64(void *cc, const void *data, size_t len) -{ - sph_radiogatun64_context *sc; - unsigned ptr; - size_t rlen; - - if (len < (2 * sizeof sc->data)) { - radiogatun64_short(cc, data, len); - return; - } - sc = cc; - ptr = sc->data_ptr; - if (ptr > 0) { - unsigned t; - - t = (sizeof sc->data) - ptr; - radiogatun64_short(sc, data, t); - data = (const unsigned char *)data + t; - len -= t; - } -#if !SPH_UNALIGNED - if (((SPH_UPTR)data & 7) != 0) { - radiogatun64_short(sc, data, len); - return; - } -#endif - rlen = radiogatun64_push13(sc, data, len); - memcpy(sc->data, (const unsigned char *)data + len - rlen, rlen); - sc->data_ptr = rlen; -} -#endif - -/* see sph_radiogatun.h */ -void -sph_radiogatun64_close(void *cc, void *dst) -{ - sph_radiogatun64_context *sc; - - sc = cc; - CLOSE(64); -} - -#endif - -#endif diff --git a/algo/radiogatun/sph_radiogatun.h b/algo/radiogatun/sph_radiogatun.h deleted file mode 100644 index 4e3888c8..00000000 --- a/algo/radiogatun/sph_radiogatun.h +++ /dev/null @@ -1,186 +0,0 @@ -/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */ -/** - * RadioGatun interface. - * - * RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters - * and G. Van Assche, "RadioGatun, a belt-and-mill hash function", - * presented at the Second Cryptographic Hash Workshop, Santa Barbara, - * August 24-25, 2006. The main Web site, containing that article, the - * reference code and some test vectors, appears to be currently located - * at the following URL: http://radiogatun.noekeon.org/ - * - * The presentation article does not specify endianness or padding. The - * reference code uses the following conventions, which we also apply - * here: - *
- *
- * The input message is an integral number of sequences of three
- * words. Each word is either a 32-bit or 64-bit word (depending on
- * the version of RadioGatun).
- *
- * Input bytes are decoded into words using little-endian
- * convention.
- *
- * Padding consists of a single bit of value 1, using little-endian
- * convention within bytes (i.e. for a byte-oriented input, a single
- * byte of value 0x01 is appended), then enough bits of value 0 to finish
- * the current block; a byte-level sketch of this rule follows the list.
- *
- * Output consists of 256 bits. Successive output words are encoded
- * with little-endian convention.
- *
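The padding rule in isolation, as a minimal sketch (not code from this tree; rg_pad_block is a hypothetical helper, and 156 bytes is the RadioGatun[32] block size used by the context below):

    #include <string.h>

    /* Append the padding bit as a 0x01 byte, then zero-fill to the end of
       the block; CLOSE_GEN in sph_radiogatun.c does exactly this inline. */
    static void rg_pad_block(unsigned char *block, size_t block_len, size_t used)
    {
        block[used++] = 0x01;
        memset(block + used, 0, block_len - used);
    }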
- * These conventions are very close to those we use for PANAMA, which is
- * a close ancestor of RadioGatun.
- *
- * RadioGatun is actually a family of functions, depending on some
- * internal parameters. We implement here two functions, with a "belt
- * length" of 13, a "belt width" of 3, and a "mill length" of 19. The
- * RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
- * variant uses 64-bit words.
- *
- * Strictly speaking, the name "RadioGatun" should use an acute accent
- * on the "u", which we omitted here to keep strict ASCII-compatibility
- * of this file.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_radiogatun.h
- * @author   Thomas Pornin
- */
-
-#ifndef SPH_RADIOGATUN_H__
-#define SPH_RADIOGATUN_H__
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-
-/**
- * Output size (in bits) for RadioGatun[32].
- */
-#define SPH_SIZE_radiogatun32   256
-
-/**
- * This structure is a context for RadioGatun[32] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[32] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[32]
- * computation can be cloned by copying the context (e.g. with a
- * simple memcpy()).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-    unsigned char data[156];   /* first field, for alignment */
-    unsigned data_ptr;
-    sph_u32 a[19], b[39];
-#endif
-} sph_radiogatun32_context;
-
-/**
- * Initialize a RadioGatun[32] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[32] context (pointer to a
- *             sph_radiogatun32_context)
- */
-void sph_radiogatun32_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that len is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[32] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun32(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[32] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accommodate the result (32 bytes).
The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[32] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun32_close(void *cc, void *dst);
-
-#if SPH_64
-
-/**
- * Output size (in bits) for RadioGatun[64].
- */
-#define SPH_SIZE_radiogatun64   256
-
-/**
- * This structure is a context for RadioGatun[64] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[64] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[64]
- * computation can be cloned by copying the context (e.g. with a
- * simple memcpy()).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-    unsigned char data[312];   /* first field, for alignment */
-    unsigned data_ptr;
-    sph_u64 a[19], b[39];
-#endif
-} sph_radiogatun64_context;
-
-/**
- * Initialize a RadioGatun[64] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[64] context (pointer to a
- *             sph_radiogatun64_context)
- */
-void sph_radiogatun64_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that len is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[64] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun64(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[64] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accommodate the result (32 bytes). The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[64] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun64_close(void *cc, void *dst);
-
-#endif
-
-#endif
diff --git a/algo/x20/x20r-gate.c b/algo/x20/x20r-gate.c
deleted file mode 100644
index 48aa85b3..00000000
--- a/algo/x20/x20r-gate.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "x20r-gate.h"
-
-void x20r_getAlgoString( const uint8_t* prevblock, char *output )
-{
-   char *sptr = output;
-
-   for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
-   {
-      char b = (19 - j) >> 1; // 16 ascii hex chars, reversed
-      uint8_t algoDigit = (j & 1) ?
prevblock[b] & 0xF : prevblock[b] >> 4;
-      if (algoDigit >= 10)
-         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
-      else
-         sprintf(sptr, "%u", (uint32_t) algoDigit);
-      sptr++;
-   }
-   *sptr = '\0';
-}
-
-bool register_x20r_algo( algo_gate_t* gate )
-{
-#if defined (X20R_4WAY)
-  gate->scanhash = (void*)&scanhash_x20r_4way;
-  gate->hash = (void*)&x20r_4way_hash;
-#else
-  gate->scanhash = (void*)&scanhash_x20r;
-  gate->hash = (void*)&x20r_hash;
-#endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
-  opt_target_factor = 256.;
-  return true;
-}
-
diff --git a/algo/x20/x20r-gate.h b/algo/x20/x20r-gate.h
deleted file mode 100644
index 07e975ad..00000000
--- a/algo/x20/x20r-gate.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef X20R_GATE_H__
-#define X20R_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-/*
-#if defined(__AVX2__) && defined(__AES__)
-  #define X20R_4WAY
-#endif
-*/
-
-enum x20r_Algo {
-   BLAKE = 0,
-   BMW,
-   GROESTL,
-   JH,
-   KECCAK,
-   SKEIN,
-   LUFFA,
-   CUBEHASH,
-   SHAVITE,
-   SIMD,
-   ECHO,
-   HAMSI,
-   FUGUE,
-   SHABAL,
-   WHIRLPOOL,
-   SHA_512,
-   HAVAL,        // 256-bit output
-   GOST,
-   RADIOGATUN,   // 256-bit output
-   PANAMA,       // 256-bit output
-   X20R_HASH_FUNC_COUNT
-};
-
-void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
-
-void x20r_getAlgoString( const uint8_t* prevblock, char *output );
-
-bool register_x20r_algo( algo_gate_t* gate );
-
-#if defined(X20R_4WAY)
-
-void x20r_4way_hash( void *state, const void *input );
-
-int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
-void x20r_hash( void *state, const void *input );
-
-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
diff --git a/algo/x20/x20r.c b/algo/x20/x20r.c
deleted file mode 100644
index 8c3d43e7..00000000
--- a/algo/x20/x20r.c
+++ /dev/null
@@ -1,252 +0,0 @@
-#include "x20r-gate.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/haval/sph-haval.h"
-#include "algo/radiogatun/sph_radiogatun.h"
-#include "algo/panama/sph_panama.h"
-#include "algo/gost/sph_gost.h"
-#include "algo/sha/sph_sha2.h"
-#if defined(__AES__)
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-#else
-  #include "algo/groestl/sph_groestl.h"
-  #include "algo/echo/sph_echo.h"
-#endif
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
-
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x20r_context_overlay
-{
-   sph_blake512_context    blake;
-   sph_bmw512_context      bmw;
-#if defined(__AES__)
-   hashState_groestl       groestl;
-   hashState_echo          echo;
-#else
-   sph_groestl512_context  groestl;
-   sph_echo512_context     echo;
-#endif
-   sph_skein512_context    skein;
-   sph_jh512_context       jh;
-   sph_keccak512_context   keccak;
-   hashState_luffa         luffa;
-   cubehashParam           cube;
-   hashState_sd            simd;
-   sph_shavite512_context  shavite;
-   sph_hamsi512_context    hamsi;
-   sph_fugue512_context    fugue;
-   sph_shabal512_context   shabal;
-   sph_whirlpool_context   whirlpool;
-
sph_sha512_context sha512; - sph_haval256_5_context haval; - sph_gost512_context gost; - sph_radiogatun64_context radiogatun; - sph_panama_context panama; -}; -typedef union _x20r_context_overlay x20r_context_overlay; - -void x20r_hash(void* output, const void* input) -{ - uint32_t _ALIGN(128) hash[64/4]; - x20r_context_overlay ctx; - void *in = (void*) input; - int size = 80; - - if ( s_ntime == UINT32_MAX ) - { - const uint8_t* in8 = (uint8_t*) input; - x20_r_s_getAlgoString(&in8[4], hashOrder); - } - - for (int i = 0; i < 20; i++) - { - const char elem = hashOrder[i]; - const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; - - switch ( algo ) - { - case BLAKE: - sph_blake512_init(&ctx.blake); - sph_blake512(&ctx.blake, in, size); - sph_blake512_close(&ctx.blake, hash); - break; - case BMW: - sph_bmw512_init(&ctx.bmw); - sph_bmw512(&ctx.bmw, in, size); - sph_bmw512_close(&ctx.bmw, hash); - break; - case GROESTL: -#if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); -#else - sph_groestl512_init(&ctx.groestl); - sph_groestl512(&ctx.groestl, in, size); - sph_groestl512_close(&ctx.groestl, hash); -#endif - break; - case SKEIN: - sph_skein512_init(&ctx.skein); - sph_skein512(&ctx.skein, in, size); - sph_skein512_close(&ctx.skein, hash); - break; - case JH: - sph_jh512_init(&ctx.jh); - sph_jh512(&ctx.jh, in, size); - sph_jh512_close(&ctx.jh, hash); - break; - case KECCAK: - sph_keccak512_init(&ctx.keccak); - sph_keccak512(&ctx.keccak, in, size); - sph_keccak512_close(&ctx.keccak, hash); - break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); - break; - case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); - break; - case SHAVITE: - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, in, size); - sph_shavite512_close(&ctx.shavite, hash); - break; - case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)in, size<<3 ); - break; - case ECHO: -#if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)in, size<<3 ); -#else - sph_echo512_init(&ctx.echo); - sph_echo512(&ctx.echo, in, size); - sph_echo512_close(&ctx.echo, hash); -#endif - break; - case HAMSI: - sph_hamsi512_init(&ctx.hamsi); - sph_hamsi512(&ctx.hamsi, in, size); - sph_hamsi512_close(&ctx.hamsi, hash); - break; - case FUGUE: - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, in, size); - sph_fugue512_close(&ctx.fugue, hash); - break; - case SHABAL: - sph_shabal512_init(&ctx.shabal); - sph_shabal512(&ctx.shabal, in, size); - sph_shabal512_close(&ctx.shabal, hash); - break; - case WHIRLPOOL: - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool(&ctx.whirlpool, in, size); - sph_whirlpool_close(&ctx.whirlpool, hash); - break; - case SHA_512: - sph_sha512_Init( &ctx.sha512 ); - sph_sha512( &ctx.sha512, in, size ); - sph_sha512_close( &ctx.sha512, hash ); - break; - case HAVAL: - sph_haval256_5_init(&ctx.haval); - sph_haval256_5(&ctx.haval, in, size); - sph_haval256_5_close(&ctx.haval, hash); - memset(&hash[8], 0, 32); - break; - case GOST: - sph_gost512_init(&ctx.gost); - sph_gost512(&ctx.gost, in, size); - sph_gost512_close(&ctx.gost, hash); - break; - case RADIOGATUN: - sph_radiogatun64_init(&ctx.radiogatun); - 
sph_radiogatun64(&ctx.radiogatun, in, size); - sph_radiogatun64_close(&ctx.radiogatun, hash); - memset(&hash[8], 0, 32); - break; - case PANAMA: - sph_panama_init(&ctx.panama); - sph_panama(&ctx.panama, in, size); - sph_panama_close(&ctx.panama, hash); - memset(&hash[8], 0, 32); - break; - } - in = (void*) hash; - size = 64; - } - memcpy(output, hash, 32); -} - -int scanhash_x20r( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; - volatile uint8_t *restart = &(work_restart[thr_id].restart); - - for (int k=0; k < 19; k++) - be32enc( &endiandata[k], pdata[k] ); - - if ( s_ntime != pdata[17] ) - { - uint32_t ntime = swab32(pdata[17]); - x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder ); - s_ntime = ntime; - if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); - } - - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - do { - be32enc( &endiandata[19], nonce ); - x20r_hash( hash32, endiandata ); - - if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) ) - { - pdata[19] = nonce; - submit_solution( work, hash32, mythr ); - } - nonce++; - - } while (nonce < max_nonce && !(*restart)); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/algo/yescrypt/yescrypt-best.c b/algo/yescrypt/yescrypt-best.c deleted file mode 100644 index 4e836215..00000000 --- a/algo/yescrypt/yescrypt-best.c +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef __SSE2__ -#include "yescrypt-simd.c" -#else -#include "yescrypt-opt.c" -#endif diff --git a/algo/yescrypt/yescrypt-platform.h b/algo/yescrypt/yescrypt-platform.h deleted file mode 100644 index bf6df915..00000000 --- a/algo/yescrypt/yescrypt-platform.h +++ /dev/null @@ -1,213 +0,0 @@ -/*- - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */
-
-#ifdef MAP_ANON
-#include <sys/mman.h>
-#endif
-
-#include "yescrypt.h"
-#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024)
-
-#ifdef __x86_64__
-#define HUGEPAGE_SIZE (2 * 1024 * 1024)
-#else
-#undef HUGEPAGE_SIZE
-#endif
-
-/*
-static __inline uint32_t
-le32dec(const void *pp)
-{
-    const uint8_t *p = (uint8_t const *)pp;
-
-    return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
-        ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
-}
-
-static __inline void
-le32enc(void *pp, uint32_t x)
-{
-    uint8_t * p = (uint8_t *)pp;
-
-    p[0] = x & 0xff;
-    p[1] = (x >> 8) & 0xff;
-    p[2] = (x >> 16) & 0xff;
-    p[3] = (x >> 24) & 0xff;
-}
-*/
-
-static void *
-alloc_region(yescrypt_region_t * region, size_t size)
-{
-    size_t base_size = size;
-    uint8_t * base, * aligned;
-#ifdef MAP_ANON
-    int flags =
-#ifdef MAP_NOCORE
-        MAP_NOCORE |
-#endif
-        MAP_ANON | MAP_PRIVATE;
-#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
-    size_t new_size = size;
-    const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
-    if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
-        flags |= MAP_HUGETLB;
-/*
- * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of
- * huge page size, so let's round up to huge page size here.
- */
-        new_size = size + hugepage_mask;
-        new_size &= ~hugepage_mask;
-    }
-    base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
-    if (base != MAP_FAILED) {
-        base_size = new_size;
-    } else
-    if (flags & MAP_HUGETLB) {
-        flags &= ~MAP_HUGETLB;
-        base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-    }
-
-#else
-    base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-#endif
-    if (base == MAP_FAILED)
-        base = NULL;
-    aligned = base;
-#elif defined(HAVE_POSIX_MEMALIGN)
-    if ((errno = posix_memalign((void **)&base, 64, size)) != 0)
-        base = NULL;
-    aligned = base;
-#else
-    base = aligned = NULL;
-    if (size + 63 < size) {
-        errno = ENOMEM;
-    } else if ((base = malloc(size + 63)) != NULL) {
-        aligned = base + 63;
-        aligned -= (uintptr_t)aligned & 63;
-    }
-#endif
-    region->base = base;
-    region->aligned = aligned;
-    region->base_size = base ? base_size : 0;
-    region->aligned_size = base ?
size : 0; - return aligned; -} - -static __inline void -init_region(yescrypt_region_t * region) -{ - region->base = region->aligned = NULL; - region->base_size = region->aligned_size = 0; -} - -static int -free_region(yescrypt_region_t * region) -{ - if (region->base) { -#ifdef MAP_ANON - if (munmap(region->base, region->base_size)) - return -1; -#else - free(region->base); -#endif - } - init_region(region); - return 0; -} - -int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen, - uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask, - uint8_t * buf, size_t buflen) -{ - yescrypt_shared1_t* shared1 = &shared->shared1; - yescrypt_shared_t dummy, half1, half2; - uint8_t salt[32]; - - if (flags & YESCRYPT_SHARED_PREALLOCATED) { - if (!shared1->aligned || !shared1->aligned_size) - return -1; - } else { - init_region(shared1); - } - shared->mask1 = 1; - if (!param && !paramlen && !N && !r && !p && !buf && !buflen) - return 0; - - init_region(&dummy.shared1); - dummy.mask1 = 1; - if (yescrypt_kdf(&dummy, shared1, - param, paramlen, NULL, 0, N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt), 0 ) ) - goto out; - - half1 = half2 = *shared; - half1.shared1.aligned_size /= 2; - half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size); - half2.shared1.aligned_size = half1.shared1.aligned_size; - N /= 2; - - if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, - salt, sizeof(salt), 0 )) - goto out; - - if (yescrypt_kdf(&half2, &half1.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - salt, sizeof(salt), 0)) - goto out; - - if (yescrypt_kdf(&half1, &half2.shared1, - param, paramlen, salt, sizeof(salt), N, r, p, 0, - YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, - buf, buflen, 0)) - goto out; - - shared->mask1 = mask; - - return 0; - -out: - if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) - free_region(shared1); - return -1; -} - -int -yescrypt_free_shared(yescrypt_shared_t * shared) -{ - return free_region(&shared->shared1); -} - -int -yescrypt_init_local(yescrypt_local_t * local) -{ - init_region(local); - return 0; -} - -int -yescrypt_free_local(yescrypt_local_t * local) -{ - return free_region(local); -} diff --git a/algo/yescrypt/yescrypt-simd.c b/algo/yescrypt/yescrypt-simd.c deleted file mode 100644 index 0cbb528d..00000000 --- a/algo/yescrypt/yescrypt-simd.c +++ /dev/null @@ -1,1392 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2012-2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * This file was originally written by Colin Percival as part of the Tarsnap
- * online backup system.
- */
-
-/*
- * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding
- * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX
- * and XOP are of further help either way.
- */
-/*
-#ifndef __SSE4_1__
-#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance"
-#endif
-*/
-
-#include <emmintrin.h>
-#ifdef __XOP__
-#include <x86intrin.h>
-#endif
-
-#include <errno.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "algo/sha/hmac-sha256-hash.h"
-#include "yescrypt.h"
-#include "yescrypt-platform.h"
-
-#include "compat.h"
-
-#if __STDC_VERSION__ >= 199901L
-/* have restrict */
-#elif defined(__GNUC__)
-#define restrict __restrict
-#else
-#define restrict
-#endif
-
-#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint));
-#define PREFETCH_OUT(x, hint) /* disabled */
-
-#ifdef __XOP__
-#define ARX(out, in1, in2, s) \
-    out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s));
-#else
-#define ARX(out, in1, in2, s) \
-    { \
-        __m128i T = _mm_add_epi32(in1, in2); \
-        out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \
-        out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \
-    }
-#endif
-
-#define SALSA20_2ROUNDS \
-    /* Operate on "columns" */ \
-    ARX(X1, X0, X3, 7) \
-    ARX(X2, X1, X0, 9) \
-    ARX(X3, X2, X1, 13) \
-    ARX(X0, X3, X2, 18) \
-\
-    /* Rearrange data */ \
-    X1 = _mm_shuffle_epi32(X1, 0x93); \
-    X2 = _mm_shuffle_epi32(X2, 0x4E); \
-    X3 = _mm_shuffle_epi32(X3, 0x39); \
-\
-    /* Operate on "rows" */ \
-    ARX(X3, X0, X1, 7) \
-    ARX(X2, X3, X0, 9) \
-    ARX(X1, X2, X3, 13) \
-    ARX(X0, X1, X2, 18) \
-\
-    /* Rearrange data */ \
-    X1 = _mm_shuffle_epi32(X1, 0x39); \
-    X2 = _mm_shuffle_epi32(X2, 0x4E); \
-    X3 = _mm_shuffle_epi32(X3, 0x93);
-
-/**
- * Apply the salsa20/8 core to the block provided in (X0 ... X3).
- */
-#define SALSA20_8_BASE(maybe_decl, out) \
-    { \
-        maybe_decl Y0 = X0; \
-        maybe_decl Y1 = X1; \
-        maybe_decl Y2 = X2; \
-        maybe_decl Y3 = X3; \
-        SALSA20_2ROUNDS \
-        SALSA20_2ROUNDS \
-        SALSA20_2ROUNDS \
-        SALSA20_2ROUNDS \
-        (out)[0] = X0 = _mm_add_epi32(X0, Y0); \
-        (out)[1] = X1 = _mm_add_epi32(X1, Y1); \
-        (out)[2] = X2 = _mm_add_epi32(X2, Y2); \
-        (out)[3] = X3 = _mm_add_epi32(X3, Y3); \
-    }
-#define SALSA20_8(out) \
-    SALSA20_8_BASE(__m128i, out)
-
-/**
- * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3).
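As an aside on the ARX macro and SALSA20_2ROUNDS above: each vector ARX is four parallel copies of the scalar Salsa20 operation out ^= rotl32(in1 + in2, s). A scalar sketch of one "column" step, under that reading:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));   /* n is 7, 9, 13 or 18 here */
    }

    /* Scalar analogue of the four column-step ARX() calls in SALSA20_2ROUNDS. */
    static void salsa20_column_step(uint32_t x[4])
    {
        x[1] ^= rotl32(x[0] + x[3], 7);
        x[2] ^= rotl32(x[1] + x[0], 9);
        x[3] ^= rotl32(x[2] + x[1], 13);
        x[0] ^= rotl32(x[3] + x[2], 18);
    }

The row step is the same pattern after the lane rotations done by _mm_shuffle_epi32.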
- */
-#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \
-    X0 = _mm_xor_si128(X0, Z0); \
-    X1 = _mm_xor_si128(X1, Z1); \
-    X2 = _mm_xor_si128(X2, Z2); \
-    X3 = _mm_xor_si128(X3, Z3); \
-    SALSA20_8_BASE(maybe_decl, out)
-
-#define SALSA20_8_XOR_MEM(in, out) \
-    SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out)
-
-#define SALSA20_8_XOR_REG(out) \
-    SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out)
-
-typedef union {
-    uint32_t w[16];
-    __m128i q[4];
-} salsa20_blk_t;
-
-/**
- * blockmix_salsa8(Bin, Bout, r):
- * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r
- * bytes in length; the output Bout must also be the same size.
- */
-static inline void
-blockmix_salsa8(const salsa20_blk_t *restrict Bin,
-    salsa20_blk_t *restrict Bout, size_t r)
-{
-    __m128i X0, X1, X2, X3;
-    size_t i;
-
-    r--;
-    PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0)
-    for (i = 0; i < r; i++) {
-        PREFETCH(&Bin[i * 2], _MM_HINT_T0)
-        PREFETCH_OUT(&Bout[i], _MM_HINT_T0)
-        PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0)
-        PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0)
-    }
-    PREFETCH(&Bin[r * 2], _MM_HINT_T0)
-    PREFETCH_OUT(&Bout[r], _MM_HINT_T0)
-    PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0)
-
-    /* 1: X <-- B_{2r - 1} */
-    X0 = Bin[r * 2 + 1].q[0];
-    X1 = Bin[r * 2 + 1].q[1];
-    X2 = Bin[r * 2 + 1].q[2];
-    X3 = Bin[r * 2 + 1].q[3];
-
-    /* 3: X <-- H(X \xor B_i) */
-    /* 4: Y_i <-- X */
-    /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-    SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q)
-
-    /* 2: for i = 0 to 2r - 1 do */
-    for (i = 0; i < r;) {
-        /* 3: X <-- H(X \xor B_i) */
-        /* 4: Y_i <-- X */
-        /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-        SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q)
-
-        i++;
-
-        /* 3: X <-- H(X \xor B_i) */
-        /* 4: Y_i <-- X */
-        /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-        SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q)
-    }
-
-    /* 3: X <-- H(X \xor B_i) */
-    /* 4: Y_i <-- X */
-    /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-    SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q)
-}
-
-/*
- * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
- * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and
- * destination registers, whereas the shifts would require an extra move
- * instruction for our code when building without AVX. Unfortunately, PSHUFD
- * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ)
- * and somewhat slower on some non-Intel CPUs (luckily not including AMD
- * Bulldozer and Piledriver). Since for many other CPUs using (V)PSHUFD is a
- * win in terms of throughput and/or not needing a move instruction, we
- * currently use it despite the higher latency on some older CPUs. As an
- * alternative, the #if below may be patched to only enable use of (V)PSHUFD
- * when building with SSE4.1 or newer, which is not available on older CPUs
- * where this instruction has higher latency.
- */ -#if 1 -#define HI32(X) \ - _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) -#elif 0 -#define HI32(X) \ - _mm_srli_si128((X), 4) -#else -#define HI32(X) \ - _mm_srli_epi64((X), 32) -#endif - -#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) -/* Intel's name, also supported by recent gcc */ -#define EXTRACT64(X) _mm_cvtsi128_si64(X) -#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) -/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ -#define EXTRACT64(X) _mm_cvtsi128_si64x(X) -#elif defined(__x86_64__) && defined(__SSE4_1__) -/* No known bugs for this intrinsic */ -#include -#define EXTRACT64(X) _mm_extract_epi64((X), 0) -#elif defined(__SSE4_1__) -/* 32-bit */ -#include -#if 0 -/* This is currently unused by the code below, which instead uses these two - * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) -#endif -#else -/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ -#define EXTRACT64(X) \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ - ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) -#endif - -/* This is tunable */ -#define S_BITS 8 - -/* Not tunable in this implementation, hard-coded in a few places */ -#define S_SIMD 2 -#define S_P 4 - -/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ -#define S_N 2 - -/* Derived values. Not tunable except via S_BITS above. */ -#define S_SIZE1 (1 << S_BITS) -#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) -#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) -#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) - -#if !defined(__x86_64__) && defined(__SSE4_1__) -/* 32-bit with SSE4.1 */ -#define PWXFORM_X_T __m128i -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = _mm_and_si128(X, _mm_set1_epi64x(S_MASK2)); \ - s0 = *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ - s1 = *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#else -/* 64-bit, or 32-bit without SSE4.1 */ -#define PWXFORM_X_T uint64_t -#define PWXFORM_SIMD(X, x, s0, s1) \ - x = EXTRACT64(X) & S_MASK2; \ - s0 = *(const __m128i *)(S0 + (uint32_t)x); \ - s1 = *(const __m128i *)(S1 + (x >> 32)); \ - X = _mm_mul_epu32(HI32(X), X); \ - X = _mm_add_epi64(X, s0); \ - X = _mm_xor_si128(X, s1); -#endif - -#define PWXFORM_ROUND \ - PWXFORM_SIMD(X0, x0, s00, s01) \ - PWXFORM_SIMD(X1, x1, s10, s11) \ - PWXFORM_SIMD(X2, x2, s20, s21) \ - PWXFORM_SIMD(X3, x3, s30, s31) - -#define PWXFORM \ - { \ - PWXFORM_X_T x0, x1, x2, x3; \ - __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_ROUND \ - } - -#define XOR4(in) \ - X0 = _mm_xor_si128(X0, (in)[0]); \ - X1 = _mm_xor_si128(X1, (in)[1]); \ - X2 = _mm_xor_si128(X2, (in)[2]); \ - X3 = _mm_xor_si128(X3, (in)[3]); - -#define XOUT(out) \ - (out)[0] = X0; \ - (out)[1] = X1; \ - (out)[2] = X2; \ - (out)[3] = X3; - -/** - * blockmix_pwxform(Bin, Bout, r, S): - * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must - * be 128r bytes in length; the output Bout must also be the same size. 
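A mental model for PWXFORM_SIMD above, per 64-bit lane: take two S-box indices from the masked 32-bit halves, do a 32x32->64 multiply of those halves, then an S-box add and an S-box xor. A simplified one-lane scalar sketch, illustrative only, since the real macros process a pair of lanes per register and index 16-byte S-box entries once per pair:

    #include <stdint.h>
    #include <string.h>

    /* One pwxform lane, scalar model. S0 and S1 point at the two S-boxes;
       s_mask selects an aligned entry, like S_MASK above. */
    static uint64_t pwxform_lane(uint64_t x, const uint8_t *S0,
                                 const uint8_t *S1, uint32_t s_mask)
    {
        uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
        uint64_t s0, s1;

        memcpy(&s0, S0 + (lo & s_mask), 8);   /* lookups use the halves   */
        memcpy(&s1, S1 + (hi & s_mask), 8);   /* from before the multiply */
        x = (uint64_t)lo * hi;                /* 32x32 -> 64 multiply     */
        x += s0;                              /* S-box add                */
        x ^= s1;                              /* S-box xor                */
        return x;
    }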
- */ -static void -blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, - size_t r, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3; - size_t i; - - if (!S) { - blockmix_salsa8(Bin, Bout, r); - return; - } - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - - /* X <-- B_{r1 - 1} */ - X0 = Bin[r].q[0]; - X1 = Bin[r].q[1]; - X2 = Bin[r].q[2]; - X3 = Bin[r].q[3]; - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin[i].q) - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) -} - -#define XOR4_2(in1, in2) \ - X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ - X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ - X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ - X3 = _mm_xor_si128((in1)[3], (in2)[3]); - -static inline uint32_t -blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM) -{ - __m128i X0, X1, X2, X3; - size_t i; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } else { - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - } - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q) - SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q) - SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q) - SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) - - return _mm_cvtsi128_si32(X0); -} - -static uint32_t -blockmix_xor(const salsa20_blk_t *restrict Bin1, - const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, int Bin2_in_ROM, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3; - size_t i; - - if (!S) - return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - if (Bin2_in_ROM) { - PREFETCH(&Bin2[r], _MM_HINT_NTA) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_NTA) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } else { - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - /* X <-- H'(X \xor B_i) */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q) - XOR4(Bin2[i].q) - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef XOR4 -#define XOR4(in, out) \ - (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ - (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ - (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ - (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); - -static inline uint32_t -blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r) -{ - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - r--; - PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i * 2], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2], _MM_HINT_T0) - PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) - PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) - } - PREFETCH(&Bin2[r * 2], _MM_HINT_T0) - PREFETCH(&Bin1[r * 2], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r], _MM_HINT_T0) - PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) - - /* 1: X <-- B_{2r - 1} */ - XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[0].q, Bin2[0].q) - SALSA20_8_XOR_REG(Bout[0].q) - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < r;) { - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r + 1 + i].q) - - i++; - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - XOR4(Bin1[i * 2].q, Bin2[i * 2].q) - SALSA20_8_XOR_REG(Bout[i].q) - } - - /* 3: X <-- H(X \xor B_i) */ - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ - XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) - SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) - - return _mm_cvtsi128_si32(X0); -} - -#define XOR4_Y \ - X0 = _mm_xor_si128(X0, Y0); \ - X1 = _mm_xor_si128(X1, Y1); \ - X2 = _mm_xor_si128(X2, Y2); \ - X3 = _mm_xor_si128(X3, Y3); - -static uint32_t -blockmix_xor_save(const salsa20_blk_t *restrict Bin1, - salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, - size_t r, const __m128i *restrict S) -{ - const uint8_t * S0, * S1; - __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; - size_t i; - - if (!S) - return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); - - S0 = (const uint8_t *)S; - S1 = (const uint8_t *)S + S_SIZE_ALL / 2; - - /* Convert 128-byte blocks to 64-byte blocks */ - r *= 2; - - r--; - PREFETCH(&Bin2[r], _MM_HINT_T0) - PREFETCH(&Bin1[r], _MM_HINT_T0) - for (i = 0; i < r; i++) { - PREFETCH(&Bin2[i], _MM_HINT_T0) - PREFETCH(&Bin1[i], _MM_HINT_T0) - PREFETCH_OUT(&Bout[i], _MM_HINT_T0) - } - PREFETCH_OUT(&Bout[r], _MM_HINT_T0); - - /* X <-- B_{r1 - 1} */ - XOR4_2(Bin1[r].q, Bin2[r].q) - - /* for i = 0 to r1 - 1 do */ - for (i = 0; i < r; i++) { - XOR4(Bin1[i].q, Bin2[i].q) - /* X <-- H'(X \xor B_i) */ - XOR4_Y - PWXFORM - /* B'_i <-- X */ - XOUT(Bout[i].q) - } - - /* Last iteration of the loop above */ - XOR4(Bin1[i].q, Bin2[i].q) - XOR4_Y - PWXFORM - - /* B'_i <-- H(B'_i) */ - SALSA20_8(Bout[i].q) - - return _mm_cvtsi128_si32(X0); -} - -#undef ARX -#undef SALSA20_2ROUNDS -#undef SALSA20_8 -#undef SALSA20_8_XOR_ANY -#undef SALSA20_8_XOR_MEM -#undef SALSA20_8_XOR_REG -#undef PWXFORM_SIMD_1 -#undef PWXFORM_SIMD_2 -#undef PWXFORM_ROUND -#undef PWXFORM -#undef OUT -#undef XOR4 -#undef XOR4_2 -#undef XOR4_Y - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static inline uint32_t -integerify(const salsa20_blk_t * B, size_t r) -{ - return B[2 * r - 1].w[0]; -} - -/** - * smix1(B, r, N, flags, V, NROM, shared, XY, S): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 128r bytes in length. The value N must be even and no - * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and - * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 - * bytes as well saves cache lines, but might result in cache bank conflicts). - */ -static void -smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, - salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, - salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = V, * Y; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - if (NROM && (VROM_mask & 1)) { - uint32_t n; - salsa20_blk_t * V_n; - const salsa20_blk_t * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - X = &V[2 * s]; - if ((1 & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j = integerify(Y, r) & (NROM - 1); - V_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - } else { - /* X <-- H(X) */ - blockmix(Y, X, r, S); - j = integerify(X, r); - } - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V_n[i * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((n + i) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 1, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((N - 1) & VROM_mask) == 1) { - /* j <-- Integerify(X) mod NROM */ - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - } - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 1, S); - } else if (flags & YESCRYPT_RW) { - uint32_t n; - salsa20_blk_t * V_n, * V_j; - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[2 * s]; - blockmix(Y, X, r, S); - j = integerify(X, r); - - for (n = 2; n < N; n <<= 1) { - uint32_t m = (n < N / 2) ? n : (N - 1 - n); - - V_n = &V[n * s]; - - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < m; i += 2) { - Y = &V_n[i * s]; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i - 1; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += i; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V_n[(i + 1) * s]; - j = blockmix_xor(Y, V_j, X, r, 0, S); - } - } - - n >>= 1; - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 2 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[(N - 1) * s]; - j = blockmix_xor(X, V_j, Y, r, 0, S); - - /* j <-- Wrap(Integerify(X), i) */ - j &= n - 1; - j += N - 1 - n; - V_j = &V[j * s]; - - /* X <-- X \xor V_j */ - /* 4: X <-- H(X) */ - X = XY; - blockmix_xor(Y, V_j, X, r, 0, S); - } else { - /* 2: for i = 0 to N - 1 do */ - for (i = 1; i < N - 1; i += 2) { - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - X = &V[(i + 1) * s]; - blockmix(Y, X, r, S); - } - - /* 4: X <-- H(X) */ - /* 3: V_i <-- X */ - Y = &V[i * s]; - blockmix(X, Y, r, S); - - /* 4: X <-- H(X) */ - X = XY; - blockmix(Y, X, r, S); - } - - /* B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage XY must be 256r bytes in length. The value N must be a power of 2 - * greater than 1. The value Nloop must be even. 
The array V must be aligned - * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 - * bytes (aligning them to 64 bytes as well saves cache lines, but might result - * in cache bank conflicts). - */ -static void -smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, - yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, - const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) -{ - const salsa20_blk_t * VROM = shared->shared1.aligned; - uint32_t VROM_mask = shared->mask1; - size_t s = 2 * r; - salsa20_blk_t * X = XY, * Y = &XY[s]; - uint64_t i; - uint32_t j; - size_t k; - - if (Nloop == 0) - return; - - /* X <-- B' */ - /* 3: V_i <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); - } - } - - i = Nloop / 2; - - /* 7: j <-- Integerify(X) mod N */ - j = integerify(X, r) & (N - 1); - -/* - * Normally, NROM implies YESCRYPT_RW, but we check for these separately - * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls - * operating on the entire V. - */ - if (NROM && (flags & YESCRYPT_RW)) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(X, V_j, Y, r, S); - - if (((i + 1) & VROM_mask) == 1) { - const salsa20_blk_t * VROM_j; - - j &= NROM - 1; - VROM_j = &VROM[j * s]; - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, VROM_j, X, r, 1, S); - } else { - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor_save(Y, V_j, X, r, S); - } - j &= N - 1; - V_j = &V[j * s]; - } - } else if (NROM) { - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < Nloop; i += 2) { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* j <-- Integerify(X) mod NROM */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - - if (((i + 1) & VROM_mask) == 1) { - j &= NROM - 1; - V_j = &VROM[j * s]; - } else { - j &= N - 1; - V_j = &V[j * s]; - } - - /* X <-- H(X \xor VROM_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 1, S); - j &= N - 1; - V_j = &V[j * s]; - } - } else if (flags & YESCRYPT_RW) { - /* 6: for i = 0 to N - 1 do */ - do { - salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(X, V_j, Y, r, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* V_j <-- Xprev \xor V_j */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor_save(Y, V_j, X, r, S); - j &= N - 1; - } while (--i); - } else { - /* 6: for i = 0 to N - 1 do */ - do { - const salsa20_blk_t * V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(X, V_j, Y, r, 0, S); - j &= N - 1; - V_j = &V[j * s]; - - /* 8: X <-- H(X \xor V_j) */ - /* 7: j <-- Integerify(X) mod N */ - j = blockmix_xor(Y, V_j, X, r, 0, S); - j &= N - 1; - } while (--i); - } - - /* 10: B' <-- X */ - for (k = 0; k < 2 * r; k++) { - for (i = 0; i < 16; i++) { - le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); - } - } -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. 
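Concretely, p2floor clears the lowest set bit until a single bit remains, so p2floor(7) == 4, p2floor(8) == 8 and p2floor(1536) == 1024 (1536 & 1535 == 1024, then 1024 & 1023 == 0, so the loop below stops at 1024). smix() further down uses it to clamp each thread's chunk length Np to a power of 2 before the second loop.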
- */
-static uint64_t
-p2floor(uint64_t x)
-{
-    uint64_t y;
-    while ((y = x & (x - 1)))
-        x = y;
-    return x;
-}
-
-/**
- * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S):
- * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the
- * temporary storage V must be 128rN bytes in length; the temporary storage XY
- * must be 256r or 256rp bytes in length (the larger size is required with
- * OpenMP-enabled builds). The value N must be a power of 2 greater than 1.
- * The array V must be aligned to a multiple of 64 bytes, and arrays B and
- * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well
- * saves cache lines and helps avoid false sharing in OpenMP-enabled builds
- * when p > 1, but it might also result in cache bank conflicts).
- */
-static void
-smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t,
-    yescrypt_flags_t flags,
-    salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared,
-    salsa20_blk_t * XY, void * S)
-{
-    size_t s = 2 * r;
-    uint32_t Nchunk = N / p;
-    uint64_t Nloop_all, Nloop_rw;
-    uint32_t i;
-
-    Nloop_all = Nchunk;
-    if (flags & YESCRYPT_RW) {
-        if (t <= 1) {
-            if (t)
-                Nloop_all *= 2; /* 2/3 */
-            Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
-        } else {
-            Nloop_all *= t - 1;
-        }
-    } else if (t) {
-        if (t == 1)
-            Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
-        Nloop_all *= t;
-    }
-
-    Nloop_rw = 0;
-    if (flags & __YESCRYPT_INIT_SHARED)
-        Nloop_rw = Nloop_all;
-    else if (flags & YESCRYPT_RW)
-        Nloop_rw = Nloop_all / p;
-
-    Nchunk &= ~(uint32_t)1; /* round down to even */
-    Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
-    Nloop_rw &= ~(uint64_t)1; /* round down to even */
-
-#ifdef _OPENMP
-#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw)
-    {
-#pragma omp for
-#endif
-    for (i = 0; i < p; i++) {
-        uint32_t Vchunk = i * Nchunk;
-        uint8_t * Bp = &B[128 * r * i];
-        salsa20_blk_t * Vp = &V[Vchunk * s];
-#ifdef _OPENMP
-        salsa20_blk_t * XYp = &XY[i * (2 * s)];
-#else
-        salsa20_blk_t * XYp = XY;
-#endif
-        uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
-        void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S;
-        if (Sp)
-            smix1(Bp, 1, S_SIZE_ALL / 128,
-                flags & ~YESCRYPT_PWXFORM,
-                Sp, NROM, shared, XYp, NULL);
-        if (!(flags & __YESCRYPT_INIT_SHARED_2))
-            smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp);
-        smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
-            NROM, shared, XYp, Sp);
-    }
-
-    if (Nloop_all > Nloop_rw) {
-#ifdef _OPENMP
-#pragma omp for
-#endif
-        for (i = 0; i < p; i++) {
-            uint8_t * Bp = &B[128 * r * i];
-#ifdef _OPENMP
-            salsa20_blk_t * XYp = &XY[i * (2 * s)];
-#else
-            salsa20_blk_t * XYp = XY;
-#endif
-            void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S;
-            smix2(Bp, r, N, Nloop_all - Nloop_rw,
-                flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp);
-        }
-    }
-#ifdef _OPENMP
-    }
-#endif
-}
-
-/**
- * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
- * N, r, p, t, flags, buf, buflen, thrid):
- * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
- * p, buflen), or a revision of scrypt as requested by flags and shared, and
- * write the result into buf. The parameters r, p, and buflen must satisfy
- * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power
- * of 2 greater than 1. (This optimized implementation currently additionally
- * limits N to the range from 8 to 2^31, but other implementations might not.)
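For scale, a hypothetical call using this API; the wrapper and its parameter values are illustrative, not taken from this tree. N = 2048, r = 8, p = 1, t = 0 makes V cost 128 * r * N = 2 MiB per thread:

    #include <stdint.h>
    #include "yescrypt.h"

    /* Usage sketch only; kdf_example and its values are hypothetical. */
    static int kdf_example(const uint8_t *passwd, size_t passwdlen,
                           const uint8_t *salt, size_t saltlen,
                           uint8_t dk[32], int thrid)
    {
        yescrypt_shared_t shared;
        yescrypt_local_t local;
        int rc;

        /* Empty shared region: no ROM, so the NROM == 0 paths are taken. */
        if (yescrypt_init_shared(&shared, NULL, 0, 0, 0, 0, 0, 0, NULL, 0))
            return -1;
        if (yescrypt_init_local(&local))
            return -1;
        rc = yescrypt_kdf(&shared, &local, passwd, passwdlen, salt, saltlen,
                          2048, 8, 1, 0, YESCRYPT_RW | YESCRYPT_PWXFORM,
                          dk, 32, thrid);
        yescrypt_free_local(&local);
        return rc;   /* 1 = success, 0 = work restart, -1 = error */
    }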
- * - * t controls computation time while not affecting peak memory usage. shared - * and flags may request special modes as described in yescrypt.h. local is - * the thread-local data structure, allowing to preserve and reuse a memory - * allocation across calls, thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - */ -int -yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, - const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, - uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, - uint8_t * buf, size_t buflen, int thrid ) -{ - uint8_t _ALIGN(128) sha256[32]; - yescrypt_region_t tmp; - uint64_t NROM; - size_t B_size, V_size, XY_size, need; - uint8_t * B, * S; - salsa20_blk_t * V, * XY; - int retval = 1; - - /* - * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, - * so don't let it have side-effects. Without this adjustment, it'd - * enable the SHA-256 password pre-hashing and output post-hashing, - * because any deviation from classic scrypt implies those. - */ - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - /* Sanity-check parameters */ - if (flags & ~YESCRYPT_KNOWN_FLAGS) { - errno = EINVAL; - return -1; - } -#if SIZE_MAX > UINT32_MAX - if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { - errno = EFBIG; - return -1; - } -#endif - if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { - errno = EFBIG; - return -1; - } - if (N > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { - errno = EINVAL; - return -1; - } - if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { - errno = EINVAL; - return -1; - } - if ((r > SIZE_MAX / 256 / p) || - (N > SIZE_MAX / 128 / r)) { - errno = ENOMEM; - return -1; - } -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX) && - (N > SIZE_MAX / 128 / (r * p))) { - errno = ENOMEM; - return -1; - } -#endif - if ((flags & YESCRYPT_PWXFORM) && -#ifndef _OPENMP - (flags & YESCRYPT_PARALLEL_SMIX) && -#endif - p > SIZE_MAX / S_SIZE_ALL) { - errno = ENOMEM; - return -1; - } - - NROM = 0; - if (shared->shared1.aligned) { - NROM = shared->shared1.aligned_size / ((size_t)128 * r); - if (NROM > UINT32_MAX) { - errno = EFBIG; - return -1; - } - if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || - !(flags & YESCRYPT_RW)) { - errno = EINVAL; - return -1; - } - } - - /* Allocate memory */ - V = NULL; - V_size = (size_t)128 * r * N; -#ifdef _OPENMP - if (!(flags & YESCRYPT_PARALLEL_SMIX)) - V_size *= p; -#endif - need = V_size; - if (flags & __YESCRYPT_INIT_SHARED) { - if (local->aligned_size < need) { - if (local->base || local->aligned || - local->base_size || local->aligned_size) { - errno = EINVAL; - return -1; - } - if (!alloc_region(local, need)) - return -1; - } - V = (salsa20_blk_t *)local->aligned; - need = 0; - } - B_size = (size_t)128 * r * p; - need += B_size; - if (need < B_size) { - errno = ENOMEM; - return -1; - } - XY_size = (size_t)256 * r; -#ifdef _OPENMP - XY_size *= p; -#endif - need += XY_size; - if (need < XY_size) { - errno = ENOMEM; - return -1; - } - if (flags & YESCRYPT_PWXFORM) { - size_t S_size = S_SIZE_ALL; -#ifdef _OPENMP - S_size *= p; -#else - if (flags & YESCRYPT_PARALLEL_SMIX) - S_size *= p; -#endif - need += S_size; - if (need < S_size) { - errno = ENOMEM; - return -1; - } - } - if (flags & __YESCRYPT_INIT_SHARED) { - if (!alloc_region(&tmp, need)) - return -1; - B = (uint8_t *)tmp.aligned; - XY = (salsa20_blk_t *)((uint8_t *)B + B_size); - } else { - 
init_region(&tmp); - if (local->aligned_size < need) { - if (free_region(local)) - return -1; - if (!alloc_region(local, need)) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - } - S = NULL; - if (flags & YESCRYPT_PWXFORM) - S = (uint8_t *)XY + XY_size; - - if (t || flags) { - SHA256_Buf( passwd, passwdlen, sha256 ); - passwd = sha256; - passwdlen = sizeof(sha256); - } - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); - - if ( work_restart[thrid].restart ) - { - retval = 0; - goto out; - } - - if (t || flags) - memcpy(sha256, B, sizeof(sha256)); - - if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { - smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); - } else { - uint32_t i; - - /* 2: for i = 0 to p - 1 do */ -#ifdef _OPENMP -#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) -#endif - for (i = 0; i < p; i++) { - /* 3: B_i <-- MF(B_i, N) */ -#ifdef _OPENMP - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, - &V[(size_t)2 * r * i * N], - NROM, shared, - &XY[(size_t)4 * r * i], - S ? &S[S_SIZE_ALL * i] : S); -#else - smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, - NROM, shared, XY, S); -#endif - } - } - - if ( work_restart[thrid].restart ) - { - retval = 0; - goto out; - } - - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); - - if ( work_restart[thrid].restart ) - { - retval = 0; - goto out; - } - - /* - * Except when computing classic scrypt, allow all computation so far - * to be performed on the client. The final steps below match those of - * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so - * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of - * SCRAM's use of SHA-1) would be usable with yescrypt hashes. - */ - if ((t || flags) && buflen == sizeof(sha256)) { - /* Compute ClientKey */ - { - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init(&ctx, buf, buflen); - if ( yescrypt_client_key ) - HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key, - yescrypt_client_key_len ); - else - HMAC_SHA256_Update( &ctx, salt, saltlen ); - HMAC_SHA256_Final(sha256, &ctx); - } - /* Compute StoredKey */ - { - SHA256_Buf( sha256, sizeof(sha256), buf ); - } - } - -out: - if (free_region(&tmp)) - return -1; - - /* Success! */ - return retval; -} diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c deleted file mode 100644 index 7d3efa1a..00000000 --- a/algo/yescrypt/yescrypt.c +++ /dev/null @@ -1,488 +0,0 @@ -/*- - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include - -#include "compat.h" - -#include "yescrypt.h" -#include "algo/sha/hmac-sha256-hash.h" -#include "algo-gate-api.h" - -#define BYTES2CHARS(bytes) \ - ((((bytes) * 8) + 5) / 6) - -#define HASH_SIZE 32 /* bytes */ -#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ -#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM) - -static const char * const itoa64 = - "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits) -{ - uint32_t bit; - - for (bit = 0; bit < srcbits; bit += 6) { - if (dstlen < 1) - return NULL; - *dst++ = itoa64[src & 0x3f]; - dstlen--; - src >>= 6; - } - - return dst; -} - -static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen) -{ - size_t i; - - for (i = 0; i < srclen; ) { - uint8_t * dnext; - uint32_t value = 0, bits = 0; - do { - value |= (uint32_t)src[i++] << bits; - bits += 8; - } while (bits < 24 && i < srclen); - dnext = encode64_uint32(dst, dstlen, value, bits); - if (!dnext) - return NULL; - dstlen -= dnext - dst; - dst = dnext; - } - - return dst; -} - -static int decode64_one(uint32_t* dst, uint8_t src) -{ - const char * ptr = strchr(itoa64, src); - if (ptr) { - *dst = (uint32_t) (ptr - itoa64); - return 0; - } - *dst = 0; - return -1; -} - -static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src) -{ - uint32_t bit; - uint32_t value; - - value = 0; - for (bit = 0; bit < dstbits; bit += 6) { - uint32_t one; - if (decode64_one(&one, *src)) { - *dst = 0; - return NULL; - } - src++; - value |= one << bit; - } - - *dst = value; - return src; -} - -uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local, - const uint8_t* passwd, size_t passwdlen, const uint8_t* setting, - uint8_t* buf, size_t buflen, int thrid ) -{ - uint8_t hash[HASH_SIZE]; - const uint8_t * src, * salt; - uint8_t * dst; - size_t prefixlen, saltlen, need; - uint8_t version; - uint64_t N; - uint32_t r, p; - yescrypt_flags_t flags = YESCRYPT_WORM; - - printf("pass1 ..."); - fflush(stdout); - - if (setting[0] != '$' || setting[1] != '7') { - printf("died$7 ..."); - fflush(stdout); - return NULL; - } - - printf("died80 ..."); - fflush(stdout); - - src = setting + 2; - - printf("hello '%p'\n", (char *)src); - fflush(stdout); - - switch ((version = *src)) { - case '$': - printf("died2 ..."); - fflush(stdout); - break; - case 'X': - src++; - flags = YESCRYPT_RW; - printf("died3 ..."); - fflush(stdout); - break; - default: - printf("died4 ..."); - fflush(stdout); - return NULL; - } - - printf("pass2 ..."); - fflush(stdout); - - if (*src != '$') { - uint32_t decoded_flags; - if (decode64_one(&decoded_flags, *src)) { - printf("died5 ..."); - fflush(stdout); - return NULL; - } - flags = decoded_flags; - if (*++src != '$') { - printf("died6 ..."); - fflush(stdout); - return NULL; - } - } - - src++; - - { - uint32_t N_log2; - if (decode64_one(&N_log2, 
*src)) { - printf("died7 ..."); - return NULL; - } - src++; - N = (uint64_t)1 << N_log2; - } - - src = decode64_uint32(&r, 30, src); - if (!src) { - printf("died6 ..."); - return NULL; - } - - src = decode64_uint32(&p, 30, src); - if (!src) { - printf("died7 ..."); - return NULL; - } - - prefixlen = src - setting; - - salt = src; - src = (uint8_t *)strrchr((char *)salt, '$'); - if (src) - saltlen = src - salt; - else - saltlen = strlen((char *)salt); - - need = prefixlen + saltlen + 1 + HASH_LEN + 1; - if (need > buflen || need < saltlen) { - printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen); - printf("died8killbuf ..."); - fflush(stdout); - return NULL; - } - - if ( yescrypt_kdf( shared, local, passwd, passwdlen, salt, saltlen, N, r, p, - 0, flags, hash, sizeof(hash), thrid ) == -1 ) - { - printf("died10 ..."); - fflush(stdout); - return NULL; - } - - dst = buf; - memcpy(dst, setting, prefixlen + saltlen); - dst += prefixlen + saltlen; - *dst++ = '$'; - - dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash)); - /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its - * memory allocations yet anyway. */ - if (!dst || dst >= buf + buflen) { /* Can't happen */ - printf("died11 ..."); - return NULL; - } - - *dst = 0; /* NUL termination */ - - printf("died12 ..."); - fflush(stdout); - - return buf; -} - -uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting, int thrid ) -{ - static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1]; - yescrypt_shared_t shared; - yescrypt_local_t local; - uint8_t * retval; - - if (yescrypt_init_shared(&shared, NULL, 0, - 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) - return NULL; - if (yescrypt_init_local(&local)) { - yescrypt_free_shared(&shared); - return NULL; - } - retval = yescrypt_r(&shared, &local, - passwd, 80, setting, buf, sizeof(buf), thrid ); - //printf("hashse='%s'\n", (char *)retval); - if (yescrypt_free_local(&local)) { - yescrypt_free_shared(&shared); - return NULL; - } - if (yescrypt_free_shared(&shared)) - return NULL; - return retval; -} - -uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags, - const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen) -{ - uint8_t * dst; - size_t prefixlen = 3 + 1 + 5 + 5; - size_t saltlen = BYTES2CHARS(srclen); - size_t need; - - if (p == 1) - flags &= ~YESCRYPT_PARALLEL_SMIX; - - if (flags) { - if (flags & ~0x3f) - return NULL; - - prefixlen++; - if (flags != YESCRYPT_RW) - prefixlen++; - } - - need = prefixlen + saltlen + 1; - if (need > buflen || need < saltlen || saltlen < srclen) - return NULL; - - if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30))) - return NULL; - - dst = buf; - *dst++ = '$'; - *dst++ = '7'; - if (flags) { - *dst++ = 'X'; /* eXperimental, subject to change */ - if (flags != YESCRYPT_RW) - *dst++ = itoa64[flags]; - } - *dst++ = '$'; - - *dst++ = itoa64[N_log2]; - - dst = encode64_uint32(dst, buflen - (dst - buf), r, 30); - if (!dst) /* Can't happen */ - return NULL; - - dst = encode64_uint32(dst, buflen - (dst - buf), p, 30); - if (!dst) /* Can't happen */ - return NULL; - - dst = encode64(dst, buflen - (dst - buf), src, srclen); - if (!dst || dst >= buf + buflen) /* Can't happen */ - return NULL; - - *dst = 0; /* NUL termination */ - - return buf; -} - -uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags, - const uint8_t * src, size_t srclen) -{ - static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1]; - return 
yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, - buf, sizeof(buf)); -} - -static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, - const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, - uint8_t * buf, size_t buflen, int thrid ) -{ - static __thread int initialized = 0; - static __thread yescrypt_shared_t shared; - static __thread yescrypt_local_t local; - int retval; - if (!initialized) { -/* "shared" could in fact be shared, but it's simpler to keep it private - * along with "local". It's dummy and tiny anyway. */ - if (yescrypt_init_shared(&shared, NULL, 0, - 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) - return -1; - if (yescrypt_init_local(&local)) { - yescrypt_free_shared(&shared); - return -1; - } - initialized = 1; - } - retval = yescrypt_kdf(&shared, &local, - passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, - buf, buflen, thrid ); -#if 0 - if (yescrypt_free_local(&local)) { - yescrypt_free_shared(&shared); - return -1; - } - if (yescrypt_free_shared(&shared)) - return -1; - initialized = 0; -#endif - return retval; -} - -// scrypt parameters initialized at run time. -uint64_t YESCRYPT_N; -uint32_t YESCRYPT_R; -uint32_t YESCRYPT_P; -char *yescrypt_client_key = NULL; -int yescrypt_client_key_len = 0; - -/* main hash 80 bytes input */ -int yescrypt_hash( const char *input, char *output, uint32_t len, int thrid ) -{ - return yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N, - YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32, thrid ); -} - -/* for util.c test */ -int yescrypthash(void *output, const void *input, int thrid) -{ - return yescrypt_hash((char*) input, (char*) output, 80, thrid); -} - -int scanhash_yescrypt( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) vhash[8]; - uint32_t _ALIGN(64) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce; - uint32_t n = first_nonce; - int thr_id = mythr->id; - - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - endiandata[19] = n; - do { - if ( yescrypt_hash((char*) endiandata, (char*) vhash, 80, thr_id ) ) - if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) - { - be32enc( pdata+19, n ); - submit_solution( work, vhash, mythr ); - } - endiandata[19] = ++n; - } while ( n < last_nonce && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce; - pdata[19] = n; - return 0; -} - -void yescrypt_gate_base(algo_gate_t *gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - opt_target_factor = 65536.0; -} - -bool register_yescrypt_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - - if ( opt_param_n ) YESCRYPT_N = opt_param_n; - else YESCRYPT_N = 2048; - - if ( opt_param_r ) YESCRYPT_R = opt_param_r; - else YESCRYPT_R = 8; - - if ( opt_param_key ) - { - yescrypt_client_key = opt_param_key; - yescrypt_client_key_len = strlen( opt_param_key ); - } - else - { - yescrypt_client_key = NULL; - yescrypt_client_key_len = 0; - } - - YESCRYPT_P = 1; - - applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d", YESCRYPT_N, - YESCRYPT_R ); - if ( yescrypt_client_key ) - applog( LOG_NOTICE,"Key= \"%s\"\n", yescrypt_client_key ); - - return true; -} - -bool register_yescryptr8_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - yescrypt_client_key = "Client Key"; - 
yescrypt_client_key_len = 10; - YESCRYPT_N = 2048; - YESCRYPT_R = 8; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr16_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - yescrypt_client_key = "Client Key"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 4096; - YESCRYPT_R = 16; - YESCRYPT_P = 1; - return true; -} - -bool register_yescryptr32_algo( algo_gate_t* gate ) -{ - yescrypt_gate_base( gate ); - yescrypt_client_key = "WaviBanana"; - yescrypt_client_key_len = 10; - YESCRYPT_N = 4096; - YESCRYPT_R = 32; - YESCRYPT_P = 1; - return true; -} - diff --git a/algo/yescrypt/yescrypt.h b/algo/yescrypt/yescrypt.h deleted file mode 100644 index 51be262e..00000000 --- a/algo/yescrypt/yescrypt.h +++ /dev/null @@ -1,382 +0,0 @@ -/*- - * Copyright 2009 Colin Percival - * Copyright 2013,2014 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#ifndef YESCRYPT_H -#define YESCRYPT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include /* for size_t */ -#include -#include "miner.h" - -//#define __SSE4_1__ - -int yescrypt_hash(const char* input, char* output, uint32_t len, int thrid ); - -int yescrypthash(void *output, const void *input, int thrid ); - -/** - * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen) and write the result into buf. The parameters r, p, and buflen - * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N - * must be a power of 2 greater than 1. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, - uint8_t * __buf, size_t __buflen); - -/** - * Internal type used by the memory allocator. Please do not use it directly. - * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since - * they might differ from each other in a future version. 
- */ -typedef struct { - void * base, * aligned; - size_t base_size, aligned_size; -} yescrypt_region_t; - -/** - * Types for shared (ROM) and thread-local (RAM) data structures. - */ -typedef yescrypt_region_t yescrypt_shared1_t; -typedef struct { - yescrypt_shared1_t shared1; - uint32_t mask1; -} yescrypt_shared_t; -typedef yescrypt_region_t yescrypt_local_t; - -/** - * Possible values for yescrypt_init_shared()'s flags argument. - */ -typedef enum { - YESCRYPT_SHARED_DEFAULTS = 0, - YESCRYPT_SHARED_PREALLOCATED = 0x100 -} yescrypt_init_shared_flags_t; - -/** - * Possible values for the flags argument of yescrypt_kdf(), - * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, - * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. - * Please refer to the description of yescrypt_kdf() below for the meaning of - * these flags. - */ -typedef enum { -/* public */ - YESCRYPT_WORM = 0, - YESCRYPT_RW = 1, - YESCRYPT_PARALLEL_SMIX = 2, - YESCRYPT_PWXFORM = 4, -/* private */ - __YESCRYPT_INIT_SHARED_1 = 0x10000, - __YESCRYPT_INIT_SHARED_2 = 0x20000, - __YESCRYPT_INIT_SHARED = 0x30000 -} yescrypt_flags_t; - -extern char *yescrypt_client_key; -extern int yescrypt_client_key_len; - - -#define YESCRYPT_KNOWN_FLAGS \ - (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ - __YESCRYPT_INIT_SHARED) - -/** - * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, - * buf, buflen): - * Optionally allocate memory for and initialize the shared (ROM) data - * structure. The parameters N, r, and p must satisfy the same conditions as - * with crypto_scrypt(). param and paramlen specify a local parameter with - * which the ROM is seeded. If buf is not NULL, then it is used to return - * buflen bytes of message digest for the initialized ROM (the caller may use - * this to verify that the ROM has been computed in the same way that it was on - * a previous run). - * - * Return 0 on success; or -1 on error. - * - * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the - * ROM is assumed to have been preallocated by the caller, with - * shared->shared1.aligned being the start address of the ROM and - * shared->shared1.aligned_size being its size (which must be consistent with - * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV - * shared memory segment allocated by the caller. - * - * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it - * should be set to 1, to interleave RAM and ROM accesses, which works well - * when both regions reside in the machine's RAM anyway. Other values may be - * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask - * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask - * values: - * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop - * 0 0 1/2 - * 1 1/2 1/2 - * 2 0 1/4 - * 3 1/4 1/4 - * 6 0 1/8 - * 7 1/8 1/8 - * 14 0 1/16 - * 15 1/16 1/16 - * 1022 0 1/1024 - * 1023 1/1024 1/1024 - * - * Actual computation of the ROM contents may be avoided, if you don't intend - * to use a ROM but need a dummy shared structure, by calling this function - * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the - * arguments starting with param and on. - * - * MT-safe as long as shared is local to the thread. 
- */ -extern int yescrypt_init_shared(yescrypt_shared_t * __shared, - const uint8_t * __param, size_t __paramlen, - uint64_t __N, uint32_t __r, uint32_t __p, - yescrypt_init_shared_flags_t __flags, uint32_t __mask, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_free_shared(shared): - * Free memory that had been allocated with yescrypt_init_shared(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as shared is local to the thread. - */ -extern int yescrypt_free_shared(yescrypt_shared_t * __shared); - -/** - * yescrypt_init_local(local): - * Initialize the thread-local (RAM) data structure. Actual memory allocation - * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_init_local(yescrypt_local_t * __local); - -/** - * yescrypt_free_local(local): - * Free memory that may have been allocated for an initialized thread-local - * (RAM) data structure. - * - * Return 0 on success; or -1 on error. - * - * MT-safe as long as local is local to the thread. - */ -extern int yescrypt_free_local(yescrypt_local_t * __local); - -/** - * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, - * N, r, p, t, flags, buf, buflen): - * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, - * p, buflen), or a revision of scrypt as requested by flags and shared, and - * write the result into buf. The parameters N, r, p, and buflen must satisfy - * the same conditions as with crypto_scrypt(). t controls computation time - * while not affecting peak memory usage. shared and flags may request - * special modes as described below. local is the thread-local data - * structure, allowing to preserve and reuse a memory allocation across calls, - * thereby reducing its overhead. - * - * Return 0 on success; or -1 on error. - * - * t controls computation time. t = 0 is optimal in terms of achieving the - * highest area-time for ASIC attackers. Thus, higher computation time, if - * affordable, is best achieved by increasing N rather than by increasing t. - * However, if the higher memory usage (which goes along with higher N) is not - * affordable, or if fine-tuning of the time is needed (recall that N must be a - * power of 2), then t = 1 or above may be used to increase time while staying - * at the same peak memory usage. t = 1 increases the time by 25% and - * decreases the normalized area-time to 96% of optimal. (Of course, in - * absolute terms the area-time increases with higher t. It's just that it - * would increase slightly more with higher N*r rather than with higher t.) - * t = 2 increases the time by another 20% and decreases the normalized - * area-time to 89% of optimal. Thus, these two values are reasonable to use - * for fine-tuning. Values of t higher than 2 result in further increase in - * time while reducing the efficiency much further (e.g., down to around 50% of - * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact - * numbers varying by the flags settings). - * - * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and - * passing a dummy shared structure (see the description of - * yescrypt_init_shared() above for how to produce one). In this mode, the - * thread-local memory region (RAM) is first sequentially written to and then - * randomly read from. 
This algorithm is friendly towards time-memory - * tradeoffs (TMTO), available both to defenders (albeit not in this - * implementation) and to attackers. - * - * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local - * memory region (RAM), which makes TMTO a lot less efficient. This may be - * used to slow down the kinds of attackers who would otherwise benefit from - * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not - * only for the tradeoff, but also for a decrease of attacker's area-time (by - * up to a constant factor), setting YESCRYPT_RW substantially increases the - * cost of attacks in area-time terms as well. Yet another benefit of it is - * that optimal area-time is reached at an earlier time than with classic - * scrypt, and t = 0 actually corresponds to this earlier completion time, - * resulting in quicker hash computations (and thus in higher request rate - * capacity). Due to these properties, YESCRYPT_RW should almost always be - * set, except when compatibility with classic scrypt or TMTO-friendliness are - * desired. - * - * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a - * lower level as compared to where it is in classic scrypt. This reduces - * flexibility for efficient computation (for both attackers and defenders) by - * requiring that, short of resorting to TMTO, the full amount of memory be - * allocated as needed for the specified p, regardless of whether that - * parallelism is actually being fully made use of or not. (For comparison, a - * single instance of classic scrypt may be computed in less memory without any - * CPU time overhead, but in more real time, by not making full use of the - * parallelism.) This may be desirable when the defender has enough memory - * with sufficiently low latency and high bandwidth for efficient full parallel - * execution, yet the required memory size is high enough that some likely - * attackers might end up being forced to choose between using higher latency - * memory than they could use otherwise (waiting for data longer) or using TMTO - * (waiting for data more times per one hash computation). The area-time cost - * for other kinds of attackers (who would use the same memory type and TMTO - * factor or no TMTO either way) remains roughly the same, given the same - * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as - * long as the defender has enough memory that is just as fast as the smaller - * per-thread regions would be, doesn't expect to ever need greater - * flexibility (except possibly via TMTO), and doesn't need backwards - * compatibility with classic scrypt, there are no other serious drawbacks to - * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, - * this new approach to parallelization makes TMTO less inefficient. (This is - * an unfortunate side-effect of avoiding some random writes, as we have to in - * order to allow for parallel threads to access a common memory region without - * synchronization overhead.) Thus, in this mode this setting poses an extra - * tradeoff of its own (higher area-time cost for a subset of attackers vs. - * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the - * way the running time is to be controlled from N*r*p (for classic scrypt) to - * N*r (in this modification). All of this applies only when p > 1. For - * p = 1, this setting is a no-op. 
- * - * Passing a real shared structure, with ROM contents previously computed by - * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for - * the thread-local RAM region. In order to allow for initialization of the - * ROM to be split into a separate program, the shared->shared1.aligned and - * shared->shared1.aligned_size fields may be set by the caller of - * yescrypt_kdf() manually rather than with yescrypt_init_shared(). - * - * local must be initialized with yescrypt_init_local(). - * - * MT-safe as long as local and buf are local to the thread. - */ -extern int yescrypt_kdf(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __salt, size_t __saltlen, - uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, - yescrypt_flags_t __flags, - uint8_t * __buf, size_t __buflen, int thrid); - -/** - * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. If the shared structure is - * not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to - * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff - * discouraging modification) is determined by the setting string. shared and - * local must be initialized as described above for yescrypt_kdf(). buf must - * be large enough (as indicated by buflen) to hold the encoded hash string. - * - * Return the encoded hash string on success; or NULL on error. - * - * MT-safe as long as local and buf are local to the thread. - */ -extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, - yescrypt_local_t * __local, - const uint8_t * __passwd, size_t __passwdlen, - const uint8_t * __setting, - uint8_t * __buf, size_t __buflen, int thrid); - -/** - * yescrypt(passwd, setting): - * Compute and encode an scrypt or enhanced scrypt hash of passwd given the - * parameters and salt value encoded in setting. Whether to use the - * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff - * discouraging modification) is determined by the setting string. - * - * Return the encoded hash string on success; or NULL on error. - * - * This is a crypt(3)-like interface, which is simpler to use than - * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, - * and it is slower than yescrypt_r() for repeated calls because it allocates - * and frees memory on each call. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting, int thrid ); - -/** - * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): - * Generate a setting string for use with yescrypt_r() and yescrypt() by - * encoding into it the parameters N_log2 (which is to be set to base 2 - * logarithm of the desired value for N), r, p, flags, and a salt given by src - * (of srclen bytes). buf must be large enough (as indicated by buflen) to - * hold the setting string. - * - * Return the setting string on success; or NULL on error. - * - * MT-safe as long as buf is local to the thread. - */ -extern uint8_t * yescrypt_gensalt_r( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen, - uint8_t * __buf, size_t __buflen); - -/** - * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): - * Generate a setting string for use with yescrypt_r() and yescrypt(). 
This - * function is the same as yescrypt_gensalt_r() except that it uses a static - * buffer and thus is not MT-safe. - * - * Return the setting string on success; or NULL on error. - * - * MT-unsafe. - */ -extern uint8_t * yescrypt_gensalt( - uint32_t __N_log2, uint32_t __r, uint32_t __p, - yescrypt_flags_t __flags, - const uint8_t * __src, size_t __srclen); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index a52dea2c..54d119e2 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -161,7 +161,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate ) // Legacy Yescrypt (yespower v0.5) -bool register_yescrypt_05_algo( algo_gate_t* gate ) +bool register_yescrypt_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; @@ -194,7 +194,7 @@ bool register_yescrypt_05_algo( algo_gate_t* gate ) } -bool register_yescryptr8_05_algo( algo_gate_t* gate ) +bool register_yescryptr8_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; @@ -207,7 +207,7 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate ) return true; } -bool register_yescryptr16_05_algo( algo_gate_t* gate ) +bool register_yescryptr16_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; @@ -220,7 +220,7 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate ) return true; } -bool register_yescryptr32_05_algo( algo_gate_t* gate ) +bool register_yescryptr32_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; diff --git a/configure b/configure index 93e54874..28c974d2 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.20.1' -PACKAGE_STRING='cpuminer-opt 3.20.1' +PACKAGE_VERSION='3.20.2' +PACKAGE_STRING='cpuminer-opt 3.20.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.20.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.20.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.20.1 +cpuminer-opt configure 3.20.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. 
-It was created by cpuminer-opt $as_me 3.20.1, which was +It was created by cpuminer-opt $as_me 3.20.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.20.1' + VERSION='3.20.2' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.20.1, which was +This file was extended by cpuminer-opt $as_me 3.20.2, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.20.1 +cpuminer-opt config.status 3.20.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index d0835055..637fb4bc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.20.1]) +AC_INIT([cpuminer-opt], [3.20.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 2a52eb2f..bca99ed2 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #endif // Mask making - // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask. // Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements. +// Effectively a sign test. #define mm_movmask_64( v ) \ _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) ) @@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) // // Bit rotations -// AVX512VL has implemented bit rotation for 128 bit vectors with -// 64 and 32 bit elements. - // x2 rotates elements in 2 individual vectors in a double buffered // optimization for SSE2, does nothing for AVX512 but is there for // transparency. -// compiler doesn't like when a variable is used for the last arg of -// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same -// specification but works with a variable. Therefore use rol_var where -// necessary. -// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required. 
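For reference, every rotate macro in these headers, the removed _var forms as well as the SSE2 fallbacks added below, relies on the same shift-or identity: ror(x,c) == (x >> c) | (x << (width-c)) for 0 < c < width. A minimal scalar sketch of the identity (illustrative only, not code from this patch):

   static inline uint32_t ror32( const uint32_t x, const int c )
   {  /* e.g. ror32( 0x11223344, 8 ) == 0x44112233 */
      return ( x >> c ) | ( x << ( 32 - c ) );
   }

The vector macros apply exactly this per 64, 32 or 16 bit element: two shifts and an OR when no hardware rotate instruction is available.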
- -#define mm128_ror_var_64( v, c ) \ - _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) - -#define mm128_rol_var_64( v, c ) \ - _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ) - -#define mm128_ror_var_32( v, c ) \ - _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ) - -#define mm128_rol_var_32( v, c ) \ - _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) - - #if defined(__AVX512VL__) -//#if defined(__AVX512F__) && defined(__AVX512VL__) #define mm128_ror_64 _mm_ror_epi64 #define mm128_rol_64 _mm_rol_epi64 @@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #else // SSE2 -#define mm128_ror_64 mm128_ror_var_64 -#define mm128_rol_64 mm128_rol_var_64 -#define mm128_ror_32 mm128_ror_var_32 -#define mm128_rol_32 mm128_rol_var_32 +#define mm128_ror_64( v, c ) \ + _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) + +#define mm128_rol_64( v, c ) \ + _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ) + +#define mm128_ror_32( v, c ) \ + _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ) + +#define mm128_rol_32( v, c ) \ + _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) #define mm128_rorx2_64( v1, v0, c ) \ { \ @@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_16( v, c ) \ _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ) +// Deprecated. +#define mm128_rol_var_32( v, c ) \ + _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) + +// // Limited 2 input shuffle, combines shuffle with blend. The destination low // half is always taken from src a, and the high half from src b. #define mm128_shuffle2_64( a, b, c ) \ @@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \ _mm_castsi128_ps( b ), c ) ); - // // Rotate vector elements accross all lanes @@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 ) +#if defined(__SSSE3__) + +// Rotate right by c bytes, no SSE2 equivalent. +static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) +{ return _mm_alignr_epi8( v, v, c ); } + +#endif + +// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations +// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available +// (unlikely but faster), or when SSSE3 is not available (slower). -// Swap 32 bit elements in 64 bit lanes #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) #define mm128_shuflr64_32 mm128_swap64_32 #define mm128_shufll64_32 mm128_swap64_32 -#if defined(__SSSE3__) +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_shuflr64_24( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) +#else + #define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 ) +#endif -// Rotate right by c bytes, no SSE2 equivalent. 
-static inline __m128i mm128_shuflr_x8( const __m128i v, const int c ) -{ return _mm_alignr_epi8( v, v, c ); } +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_shuflr64_16( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) +#else + #define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 ) +#endif + +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_swap32_16( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) +#else + #define mm128_swap32_16( v ) mm128_ror_32( v, 16 ) +#endif +#define mm128_shuflr32_16 mm128_swap32_16 +#define mm128_shufll32_16 mm128_swap32_16 + +#if defined(__SSSE3__) && !defined(__AVX512VL__) + #define mm128_shuflr32_8( v ) \ + _mm_shuffle_epi8( v, _mm_set_epi64x( \ + 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) +#else + #define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 ) +#endif // // Endian byte swap. +#if defined(__SSSE3__) + #define mm128_bswap_64( v ) \ _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \ 0x0001020304050607 ) ) @@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // // Rotate in place concatenated 128 bit vectors as one 256 bit vector. -// Swap 128 bit vectorse. - +// Swap 128 bit vectors. +// This should be avoided; it's more efficient to switch references. #define mm128_swap256_128( v1, v2 ) \ v1 = _mm_xor_si128( v1, v2 ); \ v2 = _mm_xor_si128( v1, v2 ); \ @@ -552,8 +580,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) // Function macros with two inputs and one output, inputs are preserved. // Returns the high 128 bits, ie updated v1. -// These two-input functions are not available without SSSE3. Use procedure -// macros below instead. +// These functions are preferred but only available with SSSE3. Use procedure +// macros below for SSE2 compatibility. #define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 ) #define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) #define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) // Procedure macros with 2 inputs and 2 outputs, input args are overwritten. -// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility -// with existing code. The function macros above can be used more effciently. +// Deprecated for SSSE3 and above; SSSE3 versions exist only for +// compatibility with existing code. #define mm128_vror256_64( v1, v2 ) \ do { \ diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 025436f6..2e9423e5 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -13,6 +13,18 @@ // AVX512 implementations. They will be selected automatically but their use // is limited because 256 bit vectors are less likely to be used when 512 // is available. +// +// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512 +// version is not. Some usage has the index vector encoded as if full vector +// shuffles are supported. This has no side effects and would have the same +// results using either version. +// If needed and AVX512 is available, 256 bit full vector shuffles can be +// implemented using the AVX512 zero-mask feature with a NULL mask.
+// Using intrinsics it's simple: +// _mm256_maskz_shuffle_epi8( k0, v, c ) +// With asm it's a bit more complicated with the addition of the mask register +// and zero tag: +// vpshufb ymm0{k0}{z}, ymm1, ymm2 #if defined(__AVX__) @@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #endif // Mask making - // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask. // Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements. +// Effectively a sign test. #define mm256_movmask_64( v ) \ _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) ) @@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // // Bit rotations. // -// The only bit shift for more than 64 bits is with __int128 which is slow. -// -// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements -// // x2 rotates elements in 2 individual vectors in a double buffered -// optimization for SSE2, does nothing for AVX512 but is there for +// optimization for AVX2, does nothing for AVX512 but is here for // transparency. - -// compiler doesn't like when a variable is used for the last arg of -// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where -// necessary. - -#define mm256_ror_var_64( v, c ) \ - _mm256_or_si256( _mm256_srli_epi64( v, c ), \ - _mm256_slli_epi64( v, 64-(c) ) ) - -#define mm256_rol_var_64( v, c ) \ - _mm256_or_si256( _mm256_slli_epi64( v, c ), \ - _mm256_srli_epi64( v, 64-(c) ) ) - -#define mm256_ror_var_32( v, c ) \ - _mm256_or_si256( _mm256_srli_epi32( v, c ), \ - _mm256_slli_epi32( v, 32-(c) ) ) - -#define mm256_rol_var_32( v, c ) \ - _mm256_or_si256( _mm256_slli_epi32( v, c ), \ - _mm256_srli_epi32( v, 32-(c) ) ) - - -// The spec says both F & VL are required, but just in case AMD -// decides to implement ROL/R without AVX512F. #if defined(__AVX512VL__) -//#if defined(__AVX512F__) && defined(__AVX512VL__) - -// AVX512, control must be 8 bit immediate. #define mm256_ror_64 _mm256_ror_epi64 #define mm256_rol_64 _mm256_rol_epi64 @@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) #else // AVX2 -#define mm256_ror_64 mm256_ror_var_64 -#define mm256_rol_64 mm256_rol_var_64 -#define mm256_ror_32 mm256_ror_var_32 -#define mm256_rol_32 mm256_rol_var_32 +// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8. + +#define mm256_ror_64( v, c ) \ + _mm256_or_si256( _mm256_srli_epi64( v, c ), \ + _mm256_slli_epi64( v, 64-(c) ) ) + +#define mm256_rol_64( v, c ) \ + _mm256_or_si256( _mm256_slli_epi64( v, c ), \ + _mm256_srli_epi64( v, 64-(c) ) ) + +#define mm256_ror_32( v, c ) \ + _mm256_or_si256( _mm256_srli_epi32( v, c ), \ + _mm256_slli_epi32( v, 32-(c) ) ) + +#define mm256_rol_32( v, c ) \ + _mm256_or_si256( _mm256_slli_epi32( v, c ), \ + _mm256_srli_epi32( v, 32-(c) ) ) #define mm256_rorx2_64( v1, v0, c ) \ { \ @@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_or_si256( _mm256_slli_epi16( v, c ), \ _mm256_srli_epi16( v, 16-(c) ) ) +// Deprecated. +#define mm256_rol_var_32( v, c ) \ + _mm256_or_si256( _mm256_slli_epi32( v, c ), \ + _mm256_srli_epi32( v, 32-(c) ) ) // // Rotate elements accross all lanes. 
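The AVX2 fallbacks above pair with the byte-rotation shuffles defined a little further down: rotations by 24, 16 or 8 bits reduce to a single _mm256_shuffle_epi8 (or a real ror instruction with AVX512VL), while other counts still need the two-shift-plus-OR sequence. As a hedged sketch of how a caller exploits that split, here is a Blake2b-style G mixing step for 4-way AVX2; the mm256_* macros are from this patch, but the G_4WAY name and its arguments are illustrative, not code from the miner:

   #define G_4WAY( a, b, c, d, m0, m1 ) \
   { \
      a = _mm256_add_epi64( _mm256_add_epi64( a, b ), m0 ); \
      d = mm256_swap64_32( _mm256_xor_si256( d, a ) );   /* ror 32, shuffle */ \
      c = _mm256_add_epi64( c, d ); \
      b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); /* ror 24, shuffle */ \
      a = _mm256_add_epi64( _mm256_add_epi64( a, b ), m1 ); \
      d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); /* ror 16, shuffle */ \
      c = _mm256_add_epi64( c, d ); \
      b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );  /* ror 63, shift-or */ \
   }

Three of Blake2b's four rotation counts (32, 24, 16) are multiples of 8, which is what makes the shuffle-based forms pay off; only the rotate by 63 takes the generic path.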
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) // Rotate 256 bit vector by one 64 bit element #define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) - #define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. @@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) ) - // // Rotate elements within each 128 bit lane of 256 bit vector. @@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \ _mm256_castsi256_ps( b ), c ) ); - #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) #define mm256_shuflr128_64 mm256_swap128_64 #define mm256_shufll128_64 mm256_swap128_64 @@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) { return _mm256_alignr_epi8( v, v, c ); } -// Swap 32 bit elements in each 64 bit lane. +// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit +// rotations for multiples of 8 bits. Uses faster ror/rol instructions when +// AVX512 is available. + #define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) #define mm256_shuflr64_32 mm256_swap64_32 #define mm256_shufll64_32 mm256_swap64_32 +#if defined(__AVX512VL__) + #define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 ) +#else + #define mm256_shuflr64_24( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x0a09080f0e0d0c0b, 0x0201000706050403, \ + 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) +#endif + +#if defined(__AVX512VL__) + #define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 ) +#else + #define mm256_shuflr64_16( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x09080f0e0d0c0b0a, 0x0100070605040302, \ + 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) +#endif + +#if defined(__AVX512VL__) + #define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 ) +#else + #define mm256_swap32_16( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x0d0c0f0e09080b0a, 0x0504070601000302, \ + 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) +#endif +#define mm256_shuflr32_16 mm256_swap32_16 +#define mm256_shufll32_16 mm256_swap32_16 + +#if defined(__AVX512VL__) + #define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 ) +#else + #define mm256_shuflr32_8( v ) \ + _mm256_shuffle_epi8( v, _mm256_set_epi64x( \ + 0x0c0f0e0d080b0a09, 0x0407060500030201, \ + 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) +#endif + // NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit // lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL + // AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if @@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \ } while(0) -// -// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified -// number of elements. Rotate is done in place, source arguments are -// overwritten. -// Some of these can use permute but appears to be slower. Maybe a Ryzen -// issue - -// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also -// makes these macros unnecessary. 
- -// continue using vror/vrol notation for now to avoid confusion with -// shufl2r/shufl2l macro functions available with AVX512. +// Swap 256 bit vectors in place. +// This should be avoided; it's more efficient to switch references. #define mm256_swap512_256( v1, v2 ) \ v1 = _mm256_xor_si256( v1, v2 ); \ v2 = _mm256_xor_si256( v1, v2 ); \ diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 237af5f1..4c35df6b 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // Bit rotations. // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit -// elements and can be called directly. But they only accept immediate 8 -// for control arg. -// The workaround is a fraud, just a fluke of the compiler's optimizer. -// It fails without -O3. The compiler seems to unroll shift loops, eliminating -// the variable control, better than rotate loops. +// elements and can be called directly. // // _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32 // -// For convenience and consistency with AVX2 +// For convenience and consistency with AVX2 macros. #define mm512_ror_64 _mm512_ror_epi64 #define mm512_rol_64 _mm512_rol_epi64 #define mm512_ror_32 _mm512_ror_epi32 #define mm512_rol_32 _mm512_rol_epi32 -static inline __m512i mm512_ror_var_64( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_srli_epi64( v, c ), - _mm512_slli_epi64( v, 64-c ) ); -} - -static inline __m512i mm512_rol_var_64( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_slli_epi64( v, c ), - _mm512_srli_epi64( v, 64-c ) ); -} - -static inline __m512i mm512_ror_var_32( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_srli_epi32( v, c ), - _mm512_slli_epi32( v, 32-c ) ); -} - -static inline __m512i mm512_rol_var_32( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_slli_epi32( v, c ), - _mm512_srli_epi32( v, 32-c ) ); -} - -static inline __m512i mm512_ror_16( __m512i const v, const int c ) -{ - return _mm512_or_si512( _mm512_srli_epi16( v, c ), - _mm512_slli_epi16( v, 16-c ) ); -} - -static inline __m512i mm512_rol_16( const __m512i v, const int c ) -{ - return _mm512_or_si512( _mm512_slli_epi16( v, c ), - _mm512_srli_epi16( v, 16-c ) ); -} - // Rotations using a vector control index are very slow due to overhead // to generate the index vector. Repeated rotations using the same index // are better handled by the calling function where the index only needs @@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) { return _mm512_alignr_epi8( v, v, c ); } -// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction -// but only with AVX512. Shuffle is just as fast and availble with AVX2 -// & SSE2. +// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512 since +// all can be done with ror & rol. Defined only for convenience and +// consistency with AVX2 & SSE2 macros. + #define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) #define mm512_shuflr64_32 mm512_swap64_32 #define mm512_shufll64_32 mm512_swap64_32 -// Need good way to distinguish 1 input shuffles, 2 input shuffle functions, -// and 2 input 2 output shuffle macros. -// -// shuflr is 1 input -// shufl2r is 2 input ... -// Drop macros? They can easilly be rebuilt using shufl2 functions +#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 ) +#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 ) + +#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 ) +#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 ) + +#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 ) +#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 ) + +#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 ) +#define mm512_shuflr32_16 mm512_swap32_16 +#define mm512_shufll32_16 mm512_swap32_16 + +#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 ) +#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 ) + // 2 input, 1 output -// Rotate concatenated { v1, v2 ) right or left and return v1. +// Concatenate ( v1, v2 ) then rotate right or left and return the high +// 512 bits, i.e. rotated v1. #define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 ) #define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
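To close, a standalone sanity check of the equivalence these headers depend on: rotating each 32 bit lane by 16 bits via byte shuffle gives bit-identical results to the shift-or rotate. This little program is not part of the patch; it assumes an x86-64 compiler with SSSE3 enabled (e.g. gcc -mssse3) and reuses the index vector from mm128_swap32_16:

   #include <stdio.h>
   #include <string.h>
   #include <immintrin.h>

   /* Generic shift-or rotate, same form as the SSE2 mm128_ror_32 fallback. */
   static __m128i ror32_shift( const __m128i v, const int c )
   {
      return _mm_or_si128( _mm_srli_epi32( v, c ),
                           _mm_slli_epi32( v, 32 - c ) );
   }

   /* Byte-shuffle rotate by 16, same index vector as mm128_swap32_16. */
   static __m128i ror32_16_shuffle( const __m128i v )
   {
      return _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0d0c0f0e09080b0a,
                                                  0x0504070601000302 ) );
   }

   int main(void)
   {
      const __m128i v = _mm_set_epi32( 0x33221100, 0x77665544,
                                       0xbbaa9988, 0xffeeddcc );
      __m128i a = ror32_shift( v, 16 );
      __m128i b = ror32_16_shuffle( v );
      puts( memcmp( &a, &b, sizeof a ) ? "mismatch" : "match" );  /* match */
      return 0;
   }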