From 47cc5dcff519d0be1e206bfdc52121a44d345e98 Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Sun, 10 Oct 2021 22:50:19 -0400
Subject: [PATCH] v3.18.1

---
 INSTALL_LINUX                  |   22 +-
 RELEASE_NOTES                  |   16 +-
 algo/scrypt/scrypt-core-4way.c | 2886 ++++++++++++--------------------
 algo/scrypt/scrypt.c           |  456 +++--
 algo/sha/sha-hash-4way.h       |    3 -
 algo/sha/sha256-hash-4way.c    |  881 ++--------
 algo/shabal/shabal-hash-4way.c |   13 +-
 algo/swifftx/inttypes.h        |    8 +-
 algo/swifftx/swifftx.c         |  409 ++++-
 configure                      |   20 +-
 configure.ac                   |    2 +-
 cpu-miner.c                    |   28 +-
 simd-utils/simd-256.h          |   48 +-
 simd-utils/simd-512.h          |    4 -
 14 files changed, 2013 insertions(+), 2783 deletions(-)

diff --git a/INSTALL_LINUX b/INSTALL_LINUX
index a88f888c..24927b46 100644
--- a/INSTALL_LINUX
+++ b/INSTALL_LINUX
@@ -32,14 +32,26 @@ but different package names.

 $ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
-support depending on your CPU and compiler version:
+openssl 1.1.0e or higher.

-"-march=native" is always the best choice
+znver1 and znver2 should be recognized by most recent versions of GCC and
+znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
+In the meantime, here are some suggestions for compiling on newer CPUs:

-"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+"-march=native" is usually the best choice and is used by build.sh.

-"-msha" Add SHA to other tuning options
+"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recognized.
+
+"-mcascadelake -msha" or
+"-mcometlake -mavx512 -msha" can be used for Rocket Lake.
+
+Features can also be added individually:
+
+"-msha" adds support for HW-accelerated sha256.
+
+"-mavx512" adds support for 512 bit vectors.
+
+"-mvaes" adds support for parallel AES.

 Additional instructions for static compilation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 056491f7..ef3f912f 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,10 +65,24 @@ If not what makes it happen or not happen?

 Change Log
 ----------

+v3.18.1
+
+More speed for scrypt:
+ - additional scryptn2 optimizations for all CPU architectures,
+ - AVX2 is now used by default on CPUs with SHA but not AVX512,
+ - scrypt:1024 performance lost in v3.18.0 is restored,
+ - AVX512 & AVX2 improvements to scrypt:1024.
+
+Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
+
+Issue #337: fixed a problem that could display negative stats values in the
+first summary report if the report was forced prematurely due to a stratum
+diff change. The stats are still invalid but should now display zeros.
+
 v3.18.0

 Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
-   - AVX512 & SHA support for SHA256, AVX512 has priority,
+   - AVX512 & SHA support for sha256, AVX512 has priority,
    - up to 50% increase in hashrate,
    - memory requirements reduced 30-60% depending on CPU architecture,
    - memory usage displayed at startup,
diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c
index 19ff9cdd..1039c3fc 100644
--- a/algo/scrypt/scrypt-core-4way.c
+++ b/algo/scrypt/scrypt-core-4way.c
@@ -116,23 +116,6 @@ do{ \
    c1 = XOR( c1, tc ); \
 } while (0);

-// use 16 regs AVX, AVX2, 8 buf for AVX512?
-#define ARX_4BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3, n ) \ -do{ \ - TYPE ta = ADD32( a2, a3 ); \ - TYPE tb = ADD32( b2, b3 ); \ - TYPE tc = ADD32( c2, c3 ); \ - TYPE td = ADD32( d2, d3 ); \ - ta = ROL32( ta, n ); \ - tb = ROL32( tb, n ); \ - tc = ROL32( tc, n ); \ - td = ROL32( td, n ); \ - a1 = XOR( a1, ta ); \ - b1 = XOR( b1, tb ); \ - c1 = XOR( c1, tc ); \ - d1 = XOR( d1, td ); \ -} while (0); - // Used by SIMD128 and hybrid targets, needs also ROL_1X32, SWAP_64 & // ROR_1X32 defined. @@ -208,95 +191,127 @@ do{ \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); -#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ - ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ - ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ - ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ - ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ +// For use when fast bit rotate is not available. +// contains target specif instructions, only use with 128 bit vectrors. +#define SALSA_2ROUNDS_SIMD128_2BUF_SLOROT \ +do{ \ + TYPE TA = ADD32( XA0, XA3 ); \ + TYPE TB = ADD32( XB0, XB3 ); \ + TYPE T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ + TB = _mm_srli_epi32( TB, 25 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA0 ); \ + TB = ADD32( XB1, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 );\ + TB = _mm_srli_epi32( TB, 23 );\ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA1 ); \ + TB = ADD32( XB2, XB1 ); \ + T = _mm_slli_epi32( TA, 13); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XA3 = ROR_1X32( XA3 ); \ - XB3 = ROR_1X32( XB3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 13); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA2 ); \ + TB = ADD32( XB3, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ - ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ - ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ - ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); - -// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, -// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3 ) -#define SALSA_2ROUNDS_SIMD128_4BUF \ - ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ - XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ - ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ - XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ - ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ - XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ - ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ - XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ - XA1 = ROL_1X32( XA1 ); \ - XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ - XD1 = ROL_1X32( XD1 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ +\ + TA = ADD32( XA0, XA1 ); \ + TB = ADD32( XB0, XB1 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ - XC3 = ROR_1X32( XC3 ); \ - XD3 = ROR_1X32( XD3 ); \ - XA2 = SWAP_64( XA2 ); \ - XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( 
XC2 ); \ - XD2 = SWAP_64( XD2 ); \ - ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ - XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ - ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ - XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ - ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ - XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ - ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ - XC0, XC1, XC2, XD0, XD1, XD2, 18 ); \ + XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ +\ + TA = ADD32( XA3, XA0 ); \ + TB = ADD32( XB3, XB0 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ + XB2 = XOR( XB2, TB ); \ +\ + TA = ADD32( XA2, XA3 ); \ + TB = ADD32( XB2, XB3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ - XC3 = ROL_1X32( XC3 ); \ - XD3 = ROL_1X32( XD3 ); \ - XA1 = ROR_1X32( XA1 ); \ - XB1 = ROR_1X32( XB1 ); \ - XC1 = ROR_1X32( XC1 ); \ - XD1 = ROR_1X32( XD1 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 ); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ + XB1 = XOR( XB1, TB ); \ +\ + TA = ADD32( XA1, XA2 ); \ + TB = ADD32( XB1, XB2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); - -#define SALSA_2ROUNDS_FINAL_SIMD128_4BUF \ - ARX_4BUF( XA1, XA0, XA3, XB1, XB0, XB3, \ - XC1, XC0, XC3, XD1, XD0, XD3, 7 ); \ - ARX_4BUF( XA2, XA1, XA0, XB2, XB1, XB0, \ - XC2, XC1, XC0, XD2, XD1, XD0, 9 ); \ - ARX_4BUF( XA3, XA2, XA1, XB3, XB2, XB1, \ - XC3, XC2, XC1, XD3, XD2, XD1, 13 ); \ - ARX_4BUF( XA0, XA3, XA2, XB0, XB3, XB2, \ - XC0, XC3, XC2, XD0, XD3, XD2, 18 ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ + XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + XB1 = ROR_1X32( XB1 ); \ +} while (0); + +#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \ + ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \ + ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \ + ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \ + ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ - XC1 = ROL_1X32( XC1 ); \ - XD1 = ROL_1X32( XD1 ); \ XA3 = ROR_1X32( XA3 ); \ XB3 = ROR_1X32( XB3 ); \ - XC3 = ROR_1X32( XC3 ); \ - XD3 = ROR_1X32( XD3 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ - XC2 = SWAP_64( XC2 ); \ - XD2 = SWAP_64( XD2 ); \ - ARX_4BUF( XA3, XA0, XA1, XB3, XB0, XB1, \ - XC3, XC0, XC1, XD3, XD0, XD1, 7 ); \ - ARX_4BUF( XA2, XA3, XA0, XB2, XB3, XB0, \ - XC2, XC3, XC0, XD2, XD3, XD0, 9 ); \ - ARX_4BUF( XA1, XA2, XA3, XB1, XB2, XB3, \ - XC1, XC2, XC3, XD1, XD2, XD3, 13 ); \ - ARX_4BUF( XA0, XA1, XA2, XB0, XB1, XB2, \ - XC0, XC1, XC2, XD0, XD1, XD2, 18 ); + ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \ + ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \ + ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \ + ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); + // Inlined ARX #define SALSA_2ROUNDS_SIMD128_3BUF \ @@ -402,7 +417,8 @@ do{ \ // slow rol, an attempt to optimze non-avx512 bit rotations -#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROL \ +// Contains target specific instructions, only for use with 128 bit vectors +#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ @@ -410,14 +426,14 @@ do{ 
\ TYPE T = _mm_slli_epi32( TA, 7 ); \ TA = _mm_srli_epi32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ - T = _mm_slli_epi32( TB, 7 );\ XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 7 );\ TB = _mm_srli_epi32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ - T = _mm_slli_epi32( TC, 7 );\ XB1 = XOR( XB1, TB ); \ - XC1 = XOR( XC1, T ); \ + T = _mm_slli_epi32( TC, 7 );\ TC = _mm_srli_epi32( TC, 25 );\ + XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA0 ); \ @@ -426,14 +442,14 @@ do{ \ T = _mm_slli_epi32( TA, 9 ); \ TA = _mm_srli_epi32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ + XA2 = XOR( XA2, TA ); \ T = _mm_slli_epi32( TB, 9 );\ TB = _mm_srli_epi32( TB, 23 );\ - XA2 = XOR( XA2, TA ); \ XB2 = XOR( XB2, T ); \ - T = _mm_slli_epi32( TC, 9 );\ XB2 = XOR( XB2, TB ); \ - XC2 = XOR( XC2, T ); \ + T = _mm_slli_epi32( TC, 9 );\ TC = _mm_srli_epi32( TC, 23 );\ + XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA1 ); \ @@ -442,17 +458,17 @@ do{ \ T = _mm_slli_epi32( TA, 13); \ TA = _mm_srli_epi32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ - XA3 = XOR( XA3, T ); \ XB1 = ROL_1X32( XB1 ); \ + XC1 = ROL_1X32( XC1 ); \ + XA3 = XOR( XA3, T ); \ + XA3 = XOR( XA3, TA ); \ T = _mm_slli_epi32( TB, 13); \ TB = _mm_srli_epi32( TB, 19 ); \ - XA3 = XOR( XA3, TA ); \ XB3 = XOR( XB3, T ); \ + XB3 = XOR( XB3, TB ); \ T = _mm_slli_epi32( TC, 13); \ TC = _mm_srli_epi32( TC, 19 ); \ - XB3 = XOR( XB3, TB ); \ XC3 = XOR( XC3, T ); \ - XC1 = ROL_1X32( XC1 ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA2 ); \ @@ -461,70 +477,94 @@ do{ \ T = _mm_slli_epi32( TA, 18 ); \ TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ + XB2 = SWAP_64( XB2 ); \ + XC2 = SWAP_64( XC2 ); \ XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ T = _mm_slli_epi32( TB, 18 ); \ - XB2 = SWAP_64( XB2 ); \ TB = _mm_srli_epi32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ T = _mm_slli_epi32( TC, 18 ); \ - XA0 = XOR( XA0, TA ); \ TC = _mm_srli_epi32( TC, 14 ); \ XC0 = XOR( XC0, T ); \ - XB0 = XOR( XB0, TB ); \ - XC2 = SWAP_64( XC2 ); \ XC0 = XOR( XC0, TC ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( XB0, XB1 ); \ TC = ADD32( XC0, XC1 ); \ - TA = ROL32( TA, 7 ); \ + T = _mm_slli_epi32( TA, 7 ); \ + TA = _mm_srli_epi32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ + XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - TB = ROL32( TB, 7 ); \ + T = _mm_slli_epi32( TB, 7 ); \ + TB = _mm_srli_epi32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ + XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - TC = ROL32( TC, 7 ); \ + T = _mm_slli_epi32( TC, 7 ); \ + TC = _mm_srli_epi32( TC, 25 ); \ XC3 = ROR_1X32( XC3 ); \ + XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ TC = ADD32( XC3, XC0 ); \ - TA = ROL32( TA, 9 ); \ - TB = ROL32( TB, 9 ); \ - TC = ROL32( TC, 9 ); \ + T = _mm_slli_epi32( TA, 9 ); \ + TA = _mm_srli_epi32( TA, 23 ); \ + XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ + T = _mm_slli_epi32( TB, 9 ); \ + TB = _mm_srli_epi32( TB, 23 ); \ + XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ + T = _mm_slli_epi32( TC, 9 ); \ + TC = _mm_srli_epi32( TC, 23 ); \ + XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ - TA = ROL32( TA, 13 ); \ TC = ADD32( XC2, XC3 ); \ + T = _mm_slli_epi32( TA, 13 ); \ + TA = _mm_srli_epi32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ - TB = ROL32( TB, 13 ); \ XB3 = ROL_1X32( XB3 ); \ - XA1 = XOR( XA1, TA ); \ - TC = ROL32( TC, 13 ); \ XC3 = ROL_1X32( XC3 ); \ + XA1 = XOR( XA1, T ); \ + XA1 = XOR( XA1, TA ); \ + T = _mm_slli_epi32( TB, 13 
); \ + TB = _mm_srli_epi32( TB, 19 ); \ + XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ + T = _mm_slli_epi32( TC, 13 ); \ + TC = _mm_srli_epi32( TC, 19 ); \ + XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ - TA = ROL32( TA, 18); \ TC = ADD32( XC1, XC2 ); \ + T = _mm_slli_epi32( TA, 18 ); \ + TA = _mm_srli_epi32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ - TB = ROL32( TB, 18); \ - XA0 = XOR( XA0, TA ); \ XB2 = SWAP_64( XB2 ); \ - TC = ROL32( TC, 18); \ - XB0 = XOR( XB0, TB ); \ + XA0 = XOR( XA0, T ); \ + XA0 = XOR( XA0, TA ); \ + T = _mm_slli_epi32( TB, 18 ); \ + TB = _mm_srli_epi32( TB, 14 ); \ XC2 = SWAP_64( XC2 ); \ XA1 = ROR_1X32( XA1 ); \ + XB0 = XOR( XB0, T ); \ + XB0 = XOR( XB0, TB ); \ + T = _mm_slli_epi32( TC, 18 ); \ + TC = _mm_srli_epi32( TC, 14 ); \ XB1 = ROR_1X32( XB1 ); \ - XC0 = XOR( XC0, TC ); \ XC1 = ROR_1X32( XC1 ); \ + XC0 = XOR( XC0, T ); \ + XC0 = XOR( XC0, TC ); \ } while (0); @@ -614,6 +654,12 @@ do{ \ SALSA_2ROUNDS_SIMD128_2BUF; \ SALSA_2ROUNDS_SIMD128_2BUF; +#define SALSA_8ROUNDS_SIMD128_2BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; + #define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \ SALSA_2ROUNDS_SIMD128_2BUF; \ SALSA_2ROUNDS_SIMD128_2BUF; \ @@ -626,6 +672,12 @@ do{ \ SALSA_2ROUNDS_SIMD128_3BUF; \ SALSA_2ROUNDS_SIMD128_3BUF; +#define SALSA_8ROUNDS_SIMD128_3BUF_SLOROT \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \ + SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; + #define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \ SALSA_2ROUNDS_SIMD128_3BUF; \ SALSA_2ROUNDS_SIMD128_3BUF; \ @@ -746,13 +798,13 @@ static void xor_salsa8_16way( __m512i * const B, const __m512i * const C) void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*16 ); + memcpy( &V[n * 32], X, 128*16 ); xor_salsa8_16way( &X[ 0], &X[16] ); xor_salsa8_16way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m512_ovly *vptr[16]; // pointer to V offset for each lane m512_ovly *x16 = (m512_ovly*)(&X[16]); @@ -765,12 +817,12 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m512_ovly v; // V value assembled from different indexes for ( int l = 0; l < 8; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm512_xor_si512( X[ k ], v.m512 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm512_xor_si512( X[i], v.m512 ); } xor_salsa8_16way( &X[ 0], &X[16] ); @@ -852,14 +904,14 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 4*128 ); + memcpy( &V[n * 32], X, 4*128 ); salsa8_simd128_4way( &X[ 0], &X[16] ); salsa8_simd128_4way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { uint32_t x16[4]; // index into V for each lane memcpy( x16, &X[16], 16 ); @@ -869,12 +921,12 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) x16[3] = 32 * ( x16[3] & ( N-1) ); m128_ovly *v = (m128_ovly*)V; - for( int k = 0; k < 32; k++ ) + for( int i = 0; i < 32; i++ ) { - X[k] = 
_mm_xor_si128( X[k], _mm_set_epi32( v[ x16[3] + k ].u32[3], - v[ x16[2] + k ].u32[2], - v[ x16[1] + k ].u32[1], - v[ x16[0] + k ].u32[0] ) ); + X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3], + v[ x16[2] + i ].u32[2], + v[ x16[1] + i ].u32[1], + v[ x16[0] + i ].u32[0] ) ); } salsa8_simd128_4way( &X[ 0], &X[16] ); @@ -882,49 +934,60 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) } } -// not working, occasional accepted shares, not up to date. +// 4x memory usage +// Working // 4x128 interleaving -static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +static void salsa_shuffle_4way_simd128( __m512i *X ) { - __m512i X0, X1, X2, X3; - uint32_t *b = (uint32_t*)B; - m512_ovly y[4], z[4]; - - // mix C into B then shuffle B into X - B[0] = _mm512_xor_si512( B[0], C[0] ); - B[1] = _mm512_xor_si512( B[1], C[1] ); - B[2] = _mm512_xor_si512( B[2], C[2] ); - B[3] = _mm512_xor_si512( B[3], C[3] ); + __m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; - // { l3u15, l3u10, l3u5, l3u0, l2u15, l2u10, l2u5, l2u0, - // l1u15, l1u10, l1u5, l1u0, l0u15, l0u10, l0u5, l0u0 } + Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] ); + Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] ); - // b index = row index + lane index + unit index - // = ( 8 * (u/4) ) + ( 4*l ) + ( u%4 ) + Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] ); + Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] ); - X0 = _mm512_set_epi32( b[63], b[46], b[29], b[12], // lane 3[3:0] - b[59], b[42], b[25], b[ 8], // lane 2[3:0] - b[55], b[38], b[21], b[ 4], // lane 1[3:0] - b[51], b[34], b[17], b[ 0] ); // lane 0[3:0] + Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] ); + Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] ); - X1 = _mm512_set_epi32( b[15], b[62], b[45], b[28], - b[11], b[58], b[41], b[24], - b[ 7], b[54], b[37], b[20], - b[ 3], b[50], b[33], b[16] ); // lane 0[7:4] + Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] ); + Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] ); - X2 = _mm512_set_epi32( b[31], b[14], b[61], b[44], - b[27], b[10], b[57], b[40], - b[23], b[ 6], b[53], b[36], - b[19], b[ 2], b[49], b[32] ); + X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 ); + X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 ); + X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 ); + X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 ); +} - X3 = _mm512_set_epi32( b[47], b[30], b[13], b[60], - b[43], b[26], b[ 9], b[56], - b[39], b[22], b[ 5], b[52], - b[35], b[18], b[ 1], b[48] ); +static void salsa_unshuffle_4way_simd128( __m512i *X ) +{ + __m512i Y0, Y1, Y2, Y3; + + Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] ); + Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] ); + Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] ); + Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] ); + + Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] ); + Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] ); + Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] ); + Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] ); + + X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] ); + X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] ); + X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] ); + X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] ); +} +static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) +{ + __m512i X0, X1, X2, X3; + X0 = B[0] = _mm512_xor_si512( B[0], C[0] ); + X1 = B[1] = _mm512_xor_si512( B[1], C[1] ); + X2 = B[2] = _mm512_xor_si512( B[2], C[2] ); + X3 = B[3] = _mm512_xor_si512( B[3], C[3] ); - // define 
targets for macros used in round function template #define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes #define ROR_1X32 mm512_shuflr128_32 #define SWAP_64 mm512_swap128_64 @@ -932,7 +995,7 @@ static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) #define ADD32 _mm512_add_epi32 #define XOR _mm512_xor_si512 - SALSA_8ROUNDS_FINAL_SIMD128; + SALSA_8ROUNDS_SIMD128; #undef ROL_1X32 #undef ROR_1X32 @@ -941,123 +1004,25 @@ static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C) #undef ADD32 #undef XOR - y[0].m512 = X0; - y[1].m512 = X1; - y[2].m512 = X2; - y[3].m512 = X3; - - // lane 0 - z[0].u32[ 0 ] = y[0].u32[ 0]; - z[0].u32[ 3 ] = y[1].u32[ 0]; - z[0].u32[ 2 ] = y[2].u32[ 0]; - z[0].u32[ 1 ] = y[3].u32[ 0]; - - // lane 1 - z[0].u32[ 0+ 4 ] = y[0].u32[ 4]; - z[0].u32[ 3+ 4 ] = y[1].u32[ 4]; - z[0].u32[ 2+ 4 ] = y[2].u32[ 4]; - z[0].u32[ 1+ 4 ] = y[3].u32[ 4]; - - // lane 2 - z[0].u32[ 0+ 8 ] = y[0].u32[ 8]; - z[0].u32[ 3+ 8 ] = y[1].u32[ 8]; - z[0].u32[ 2+ 8 ] = y[2].u32[ 8]; - z[0].u32[ 1+ 8 ] = y[3].u32[ 8]; - - // lane 3 - z[0].u32[ 0+12 ] = y[0].u32[12]; - z[0].u32[ 3+12 ] = y[1].u32[12]; - z[0].u32[ 2+12 ] = y[2].u32[12]; - z[0].u32[ 1+12 ] = y[3].u32[12]; - - // lane 0 - z[1].u32[ 1 ] = y[0].u32[ 1]; - z[1].u32[ 0 ] = y[1].u32[ 1]; - z[1].u32[ 3 ] = y[2].u32[ 1]; - z[1].u32[ 2 ] = y[3].u32[ 1]; - - //lane 1 - z[1].u32[ 1+ 4 ] = y[0].u32[ 5]; - z[1].u32[ 0+ 4 ] = y[1].u32[ 5]; - z[1].u32[ 3+ 4 ] = y[2].u32[ 5]; - z[1].u32[ 2+ 4 ] = y[3].u32[ 5]; - - // lane 2 - z[1].u32[ 1+ 8 ] = y[0].u32[ 9]; - z[1].u32[ 0+ 8 ] = y[1].u32[ 9]; - z[1].u32[ 3+ 8 ] = y[2].u32[ 9]; - z[1].u32[ 2+ 8 ] = y[3].u32[ 9]; - - // lane 3 - z[1].u32[ 1+12 ] = y[0].u32[13]; - z[1].u32[ 0+12 ] = y[1].u32[13]; - z[1].u32[ 3+12 ] = y[2].u32[13]; - z[1].u32[ 2+12 ] = y[3].u32[13]; - - // lane 0 - z[2].u32[ 2 ] = y[0].u32[2]; - z[2].u32[ 1 ] = y[1].u32[2]; - z[2].u32[ 0 ] = y[2].u32[2]; - z[2].u32[ 3 ] = y[3].u32[2]; - - // lane 1 - z[2].u32[ 2+ 4 ] = y[0].u32[6]; - z[2].u32[ 1+ 4 ] = y[1].u32[6]; - z[2].u32[ 0+ 4 ] = y[2].u32[6]; - z[2].u32[ 3+ 4 ] = y[3].u32[6]; - - // lane 2 - z[2].u32[ 2+ 8 ] = y[0].u32[10]; - z[2].u32[ 1+ 8 ] = y[1].u32[10]; - z[2].u32[ 0+ 8 ] = y[2].u32[10]; - z[2].u32[ 3+ 8 ] = y[3].u32[10]; - - // lane 3 - z[2].u32[ 2+12 ] = y[0].u32[14]; - z[2].u32[ 1+12 ] = y[1].u32[14]; - z[2].u32[ 0+12 ] = y[2].u32[14]; - z[2].u32[ 3+12 ] = y[3].u32[14]; - - // lane 0 - z[3].u32[ 3 ] = y[0].u32[ 3]; - z[3].u32[ 2 ] = y[1].u32[ 3]; - z[3].u32[ 1 ] = y[2].u32[ 3]; - z[3].u32[ 0 ] = y[3].u32[ 3]; - - // lane 1 - z[3].u32[ 3+ 4 ] = y[0].u32[ 7]; - z[3].u32[ 2+ 4 ] = y[1].u32[ 7]; - z[3].u32[ 1+ 4 ] = y[2].u32[ 7]; - z[3].u32[ 0+ 4 ] = y[3].u32[ 7]; - - // lane 2 - z[3].u32[ 3+ 8 ] = y[0].u32[11]; - z[3].u32[ 2+ 8 ] = y[1].u32[11]; - z[3].u32[ 1+ 8 ] = y[2].u32[11]; - z[3].u32[ 0+ 8 ] = y[3].u32[11]; - - // lane 1 - z[3].u32[ 3+12 ] = y[0].u32[15]; - z[3].u32[ 2+12 ] = y[1].u32[15]; - z[3].u32[ 1+12 ] = y[2].u32[15]; - z[3].u32[ 0+12 ] = y[3].u32[15]; - - B[0] = _mm512_add_epi32( B[0], z[0].m512 ); - B[1] = _mm512_add_epi32( B[1], z[1].m512 ); - B[2] = _mm512_add_epi32( B[2], z[2].m512 ); - B[3] = _mm512_add_epi32( B[3], z[3].m512 ); + B[0] = _mm512_add_epi32( B[0], X0 ); + B[1] = _mm512_add_epi32( B[1], X1 ); + B[2] = _mm512_add_epi32( B[2], X2 ); + B[3] = _mm512_add_epi32( B[3], X3 ); } void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + salsa_shuffle_4way_simd128( X ); + salsa_shuffle_4way_simd128( 
X+4 ); + + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 8], X, 128*4 ); + memcpy( &V[n * 8], X, 128*4 ); salsa8_4way_simd128( &X[0], &X[4] ); salsa8_4way_simd128( &X[4], &X[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m512_ovly x16; x16 = ( (m512_ovly*)X )[4]; @@ -1066,25 +1031,22 @@ void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ) uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) ); uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) - X[k] = _mm512_xor_si512( X[k], m512_const_128( - ( (m512_ovly*)V )[ j3+k ].m128[3], - ( (m512_ovly*)V )[ j2+k ].m128[2], - ( (m512_ovly*)V )[ j1+k ].m128[1], - ( (m512_ovly*)V )[ j0+k ].m128[0] ) ); + for ( int i = 0; i < 8; i++ ) + { + __m512i v10 = _mm512_mask_blend_epi32( 0x000f, V[ j1+i ], V[ j0+i ] ); + __m512i v32 = _mm512_mask_blend_epi32( 0x0f00, V[ j3+i ], V[ j2+i ] ); + X[i] = _mm512_xor_si512( X[i], _mm512_mask_blend_epi32( 0x00ff, + v32, v10 ) ); + } -/* - for ( int k = 0; k < 8; k++ ) - X[k] = _mm512_xor_si512( X[k], m512_diagonal128_32( - V[ j3+k ], V[ j2+k ], V[ j1+k ], V[ j0+k ] ) ); -*/ salsa8_4way_simd128( &X[0], &X[4] ); salsa8_4way_simd128( &X[4], &X[0] ); } + + salsa_unshuffle_4way_simd128( X ); + salsa_unshuffle_4way_simd128( X+4 ); } - - #endif // AVX512 #if defined(__AVX2__) @@ -1142,14 +1104,14 @@ static void salsa8_8way( __m256i * const B, const __m256i * const C ) void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*8 ); + memcpy( &V[n * 32], X, 128*8 ); salsa8_8way( &X[ 0], &X[16] ); salsa8_8way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly *vptr[8]; // pointer to V offset for each lane m256_ovly *x16 = (m256_ovly*)(&X[16]); @@ -1162,12 +1124,12 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m256_ovly v; // V value assembled from different indexes for ( int l = 0; l < 8; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm256_xor_si256( X[ k ], v.m256 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm256_xor_si256( X[i], v.m256 ); } salsa8_8way( &X[ 0], &X[16] ); @@ -1176,7 +1138,7 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) } // 2x memory usage -// Working, not up to date, needs stream optimization. 
+// Working // Essentially Pooler 6way // 2x128 interleaved simd128 // ------- lane 1 ------- ------- lane 0 ------- @@ -1185,31 +1147,56 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N ) // { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16] // { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24] -static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +static void salsa_shuffle_2way_simd128( __m256i *X ) { - __m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + __m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3; - // mix C into B then shuffle B into X - B[0] = _mm256_xor_si256( B[0], C[0] ); - B[1] = _mm256_xor_si256( B[1], C[1] ); - B[2] = _mm256_xor_si256( B[2], C[2] ); - B[3] = _mm256_xor_si256( B[3], C[3] ); + Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 ); + Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 ); + + Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 ); + Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 ); + + Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 ); + Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 ); + + Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 ); + Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 ); - Y0 = _mm256_blend_epi32( B[1], B[0], 0x11 ); - X0 = _mm256_blend_epi32( B[3], B[2], 0x44 ); - X0 = _mm256_blend_epi32( X0, Y0, 0x33); + X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 ); + X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 ); + X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 ); + X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 ); +} - Y1 = _mm256_blend_epi32( B[2], B[1], 0x11 ); - X1 = _mm256_blend_epi32( B[0], B[3], 0x44 ); - X1 = _mm256_blend_epi32( X1, Y1, 0x33 ); +static void salsa_unshuffle_2way_simd128( __m256i *X ) +{ + __m256i Y0, Y1, Y2, Y3; + + Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 ); + Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 ); + Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 ); + Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 ); + + Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 ); + Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 ); + Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 ); + Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 ); + + X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 ); + X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 ); + X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 ); + X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 ); +} - Y2 = _mm256_blend_epi32( B[3], B[2], 0x11 ); - X2 = _mm256_blend_epi32( B[1], B[0], 0x44 ); - X2 = _mm256_blend_epi32( X2, Y2, 0x33 ); +static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) +{ + __m256i X0, X1, X2, X3; - Y3 = _mm256_blend_epi32( B[0], B[3], 0x11 ); - X3 = _mm256_blend_epi32( B[2], B[1], 0x44 ); - X3 = _mm256_blend_epi32( X3, Y3, 0x33 ); + X0 = B[0] = _mm256_xor_si256( B[0], C[0] ); + X1 = B[1] = _mm256_xor_si256( B[1], C[1] ); + X2 = B[2] = _mm256_xor_si256( B[2], C[2] ); + X3 = B[3] = _mm256_xor_si256( B[3], C[3] ); // define targets for macros used in round function template #define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes @@ -1228,52 +1215,41 @@ static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C) #undef ADD32 #undef XOR - // init with X0 then blend in the other elements + B[0] = _mm256_add_epi32( B[0], X0 ); + B[1] = _mm256_add_epi32( B[1], X1 ); + B[2] = _mm256_add_epi32( B[2], X2 ); + B[3] = _mm256_add_epi32( B[3], X3 ); +} - Y0 = _mm256_blend_epi32( X0, X1, 0x88 ); - Y1 = _mm256_blend_epi32( X0, X1, 0x11 ); - Y2 = _mm256_blend_epi32( X0, X1, 0x22 ); - Y3 = _mm256_blend_epi32( X0, X1, 0x44 ); +void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) +{ + 
salsa_shuffle_2way_simd128( X ); + salsa_shuffle_2way_simd128( X+4 ); - Y0 = _mm256_blend_epi32( Y0, X2, 0x44 ); - Y1 = _mm256_blend_epi32( Y1, X2, 0x88 ); - Y2 = _mm256_blend_epi32( Y2, X2, 0x11 ); - Y3 = _mm256_blend_epi32( Y3, X2, 0x22 ); - - Y0 = _mm256_blend_epi32( Y0, X3, 0x22 ); - Y1 = _mm256_blend_epi32( Y1, X3, 0x44 ); - Y2 = _mm256_blend_epi32( Y2, X3, 0x88 ); - Y3 = _mm256_blend_epi32( Y3, X3, 0x11 ); - - B[0] = _mm256_add_epi32( B[0], Y0 ); - B[1] = _mm256_add_epi32( B[1], Y1 ); - B[2] = _mm256_add_epi32( B[2], Y2 ); - B[3] = _mm256_add_epi32( B[3], Y3 ); -} - -void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N ) -{ - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 8], X, 128*2 ); + memcpy( &V[n * 8], X, 128*2 ); salsa8_2way_simd128( &X[0], &X[4] ); salsa8_2way_simd128( &X[4], &X[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly x16; x16 = ( (m256_ovly*)X )[4]; uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) ); uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) - X[k] = _mm256_xor_si256( X[k], _mm256_blend_epi32( V[ j1+k ], - V[ j0+k ], 0x0f ) ); + for ( int i = 0; i < 8; i++ ) + X[i] = _mm256_xor_si256( X[i], _mm256_blend_epi32( V[ j1+i ], + V[ j0+i ], 0x0f ) ); salsa8_2way_simd128( &X[0], &X[4] ); salsa8_2way_simd128( &X[4], &X[0] ); } + + salsa_unshuffle_2way_simd128( X ); + salsa_unshuffle_2way_simd128( X+4 ); } // Working @@ -1386,17 +1362,17 @@ void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) __m256i *V0 = V; __m256i *V1 = V + 8*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( V0 + i*8 + k, X0[k] ); - _mm256_stream_si256( V1 + i*8 + k, X1[k] ); + _mm256_stream_si256( V0 + n*8 + i, X0[i] ); + _mm256_stream_si256( V1 + n*8 + i, X1[i] ); } salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { const m256_ovly x16a = ( (m256_ovly*)X0 )[4]; const m256_ovly x16b = ( (m256_ovly*)X1 )[4]; @@ -1406,25 +1382,16 @@ void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N ) const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) ); const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + k ); - const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + k ); - const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + k ); - const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + k ); - X0[k] = _mm256_xor_si256( X0[k], + const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + i ); + const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + i ); + const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + i ); + const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + i ); + X0[i] = _mm256_xor_si256( X0[i], _mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], + X1[i] = _mm256_xor_si256( X1[i], _mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) ); - - -/* - X0[k] = _mm256_xor_si256( X0[k], - _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], - _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); -*/ - } salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] ); @@ -1577,17 +1544,17 @@ void 
scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) __m256i *V1 = V + 8*N; __m256i *V2 = V + 16*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V0[i * 8], X0, 128*2 ); - memcpy( &V1[i * 8], X1, 128*2 ); - memcpy( &V2[i * 8], X2, 128*2 ); + memcpy( &V0[n * 8], X0, 128*2 ); + memcpy( &V1[n * 8], X1, 128*2 ); + memcpy( &V2[n * 8], X2, 128*2 ); salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], &X0[4], &X1[4], &X2[4] ); salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4], &X0[0], &X1[0], &X2[0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m256_ovly x16a, x16b, x16c; x16a = ( (m256_ovly*)X0 )[4]; @@ -1601,14 +1568,14 @@ void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N ) uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) ); uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - X0[k] = _mm256_xor_si256( X0[k], - _mm256_blend_epi32( V0[ j1a+k ], V0[ j0a+k ], 0x0f ) ); - X1[k] = _mm256_xor_si256( X1[k], - _mm256_blend_epi32( V1[ j1b+k ], V1[ j0b+k ], 0x0f ) ); - X2[k] = _mm256_xor_si256( X2[k], - _mm256_blend_epi32( V2[ j1c+k ], V2[ j0c+k ], 0x0f ) ); + X0[i] = _mm256_xor_si256( X0[i], + _mm256_blend_epi32( V0[ j1a+i ], V0[ j0a+i ], 0x0f ) ); + X1[i] = _mm256_xor_si256( X1[i], + _mm256_blend_epi32( V1[ j1b+i ], V1[ j0b+i ], 0x0f ) ); + X2[i] = _mm256_xor_si256( X2[i], + _mm256_blend_epi32( V2[ j1c+i ], V2[ j0c+i ], 0x0f ) ); } salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0], @@ -1707,23 +1674,23 @@ static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c ) void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) - _mm256_stream_si256( (__m256i*)V + i*8 + k, casti_m256i( X, k ) ); + for ( int i = 0; i < 8; i++ ) + _mm256_stream_si256( (__m256i*)V + n*8 + i, casti_m256i( X, i ) ); salsa8_simd128_2way( &X[ 0], &X[16] ); salsa8_simd128_2way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { // need 2 J's const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) ); const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= ( ( V[ j1 + k ] & 0xffffffff00000000 ) - | ( V[ j0 + k ] & 0x00000000ffffffff ) ); + for ( int i = 0; i < 32; i++ ) + X[i] ^= ( ( V[ j1 + i ] & 0xffffffff00000000 ) + | ( V[ j0 + i ] & 0x00000000ffffffff ) ); salsa8_simd128_2way( &X[ 0], &X[16] ); salsa8_simd128_2way( &X[16], &X[ 0] ); @@ -1845,18 +1812,18 @@ void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) uint64_t *V0 = V; uint64_t *V1 = V + 32*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( (__m256i*)V0 + i*8 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*8 + k, casti_m256i( X1, k ) ); + _mm256_stream_si256( (__m256i*)V0 + n*8 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*8 + i, casti_m256i( X1, i ) ); } salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { // need 4 J's const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) ); @@ -1864,12 +1831,12 @@ void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N ) const 
uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) ); const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); } salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); @@ -2025,18 +1992,18 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, uint64_t *V1 = V + 32*N; uint64_t *V2 = V + 64*N; - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V0[i * 32], X0, 2*128 ); - memcpy( &V1[i * 32], X1, 2*128 ); - memcpy( &V2[i * 32], X2, 2*128 ); + memcpy( &V0[ n*32 ], X0, 2*128 ); + memcpy( &V1[ n*32 ], X1, 2*128 ); + memcpy( &V2[ n*32 ], X2, 2*128 ); salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], &X0[16], &X1[16], &X2[16] ); salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16], &X0[ 0], &X1[ 0], &X2[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); @@ -2045,14 +2012,14 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); - X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) - | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); + X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 ) + | ( V0[ j0l + i ] & 0x00000000ffffffff ) ); + X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 ) + | ( V1[ j1l + i ] & 0x00000000ffffffff ) ); + X2[i] ^= ( ( V2[ j2h + i ] & 0xffffffff00000000 ) + | ( V2[ j2l + i ] & 0x00000000ffffffff ) ); } salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0], &X0[16], &X1[16], &X2[16] ); @@ -2061,229 +2028,6 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, } } -// Working, deprecated -// 8x memory usage -// 2x32 interleaving -static void salsa8_simd128_2way_4buf( uint64_t *BA, uint64_t *BB, - uint64_t *BC, uint64_t *BD, const uint64_t *CA, const uint64_t *CB, - const uint64_t *CC, const uint64_t *CD ) -{ - __m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; - __m256i *ba = (__m256i*)BA; - __m256i *bb = (__m256i*)BB; - __m256i *bc = (__m256i*)BC; - __m256i *bd = (__m256i*)BD; - const __m256i *ca = (const __m256i*)CA; - const __m256i *cb = (const __m256i*)CB; - const __m256i *cc = (const __m256i*)CC; - const __m256i *cd = (const __m256i*)CD; - m256_ovly ya[4], yb[4], yc[4], yd[4], - za[4], zb[4], zc[4], zd[4]; - - // mix C into B then shuffle B into X - ba[0] = _mm256_xor_si256( ba[0], ca[0] ); - bb[0] = _mm256_xor_si256( bb[0], cb[0] ); - bc[0] = _mm256_xor_si256( bc[0], cc[0] ); - bd[0] = _mm256_xor_si256( bd[0], cd[0] ); - ba[1] = _mm256_xor_si256( ba[1], ca[1] ); - bb[1] = _mm256_xor_si256( bb[1], cb[1] ); - bc[1] = 
_mm256_xor_si256( bc[1], cc[1] ); - bd[1] = _mm256_xor_si256( bd[1], cd[1] ); - ba[2] = _mm256_xor_si256( ba[2], ca[2] ); - bb[2] = _mm256_xor_si256( bb[2], cb[2] ); - bc[2] = _mm256_xor_si256( bc[2], cc[2] ); - bd[2] = _mm256_xor_si256( bd[2], cd[2] ); - ba[3] = _mm256_xor_si256( ba[3], ca[3] ); - bb[3] = _mm256_xor_si256( bb[3], cb[3] ); - bc[3] = _mm256_xor_si256( bc[3], cc[3] ); - bd[3] = _mm256_xor_si256( bd[3], cd[3] ); - - XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] ); - XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] ); - XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] ); - XD0 = _mm256_set_epi64x( BD[15], BD[10], BD[ 5], BD[ 0] ); - XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] ); - XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] ); - XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] ); - XD1 = _mm256_set_epi64x( BD[ 3], BD[14], BD[ 9], BD[ 4] ); - XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] ); - XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] ); - XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] ); - XD2 = _mm256_set_epi64x( BD[ 7], BD[ 2], BD[13], BD[ 8] ); - XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] ); - XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] ); - XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] ); - XD3 = _mm256_set_epi64x( BD[11], BD[ 6], BD[ 1], BD[12] ); - - // define targets for macros used in round function template - #define ROL_1X32 mm256_shufll_64 - #define ROR_1X32 mm256_shuflr_64 - #define SWAP_64 mm256_swap_128 - #define ROL32 mm256_rol_32 - #define ADD32 _mm256_add_epi32 - #define XOR _mm256_xor_si256 - #define TYPE __m256i - - SALSA_8ROUNDS_FINAL_SIMD128_4BUF; - - #undef ROL_1X32 - #undef ROR_1X32 - #undef SWAP_64 - #undef ROL32 - #undef ADD32 - #undef XOR - #undef TYPE - - ya[0].m256 = XA0; yb[0].m256 = XB0; - yc[0].m256 = XC0; yd[0].m256 = XD0; - ya[1].m256 = XA1; yb[1].m256 = XB1; - yc[1].m256 = XC1; yd[1].m256 = XD1; - ya[2].m256 = XA2; yb[2].m256 = XB2; - yc[2].m256 = XC2; yd[2].m256 = XD2; - ya[3].m256 = XA3; yb[3].m256 = XB3; - yc[3].m256 = XC3; yd[3].m256 = XD3; - - za[0].u64[0] = ya[0].u64[0]; - zb[0].u64[0] = yb[0].u64[0]; - zc[0].u64[0] = yc[0].u64[0]; - zd[0].u64[0] = yd[0].u64[0]; - za[0].u64[3] = ya[1].u64[0]; - zb[0].u64[3] = yb[1].u64[0]; - zc[0].u64[3] = yc[1].u64[0]; - zd[0].u64[3] = yd[1].u64[0]; - za[0].u64[2] = ya[2].u64[0]; - zb[0].u64[2] = yb[2].u64[0]; - zc[0].u64[2] = yc[2].u64[0]; - zd[0].u64[2] = yd[2].u64[0]; - za[0].u64[1] = ya[3].u64[0]; - zb[0].u64[1] = yb[3].u64[0]; - zc[0].u64[1] = yc[3].u64[0]; - zd[0].u64[1] = yd[3].u64[0]; - - za[1].u64[1] = ya[0].u64[1]; - zb[1].u64[1] = yb[0].u64[1]; - zc[1].u64[1] = yc[0].u64[1]; - zd[1].u64[1] = yd[0].u64[1]; - za[1].u64[0] = ya[1].u64[1]; - zb[1].u64[0] = yb[1].u64[1]; - zc[1].u64[0] = yc[1].u64[1]; - zd[1].u64[0] = yd[1].u64[1]; - za[1].u64[3] = ya[2].u64[1]; - zb[1].u64[3] = yb[2].u64[1]; - zc[1].u64[3] = yc[2].u64[1]; - zd[1].u64[3] = yd[2].u64[1]; - za[1].u64[2] = ya[3].u64[1]; - zb[1].u64[2] = yb[3].u64[1]; - zc[1].u64[2] = yc[3].u64[1]; - zd[1].u64[2] = yd[3].u64[1]; - - za[2].u64[2] = ya[0].u64[2]; - zb[2].u64[2] = yb[0].u64[2]; - zc[2].u64[2] = yc[0].u64[2]; - zd[2].u64[2] = yd[0].u64[2]; - za[2].u64[1] = ya[1].u64[2]; - zb[2].u64[1] = yb[1].u64[2]; - zc[2].u64[1] = yc[1].u64[2]; - zd[2].u64[1] = yd[1].u64[2]; - za[2].u64[0] = ya[2].u64[2]; - zb[2].u64[0] = yb[2].u64[2]; - zc[2].u64[0] = yc[2].u64[2]; - zd[2].u64[0] = yd[2].u64[2]; - za[2].u64[3] = ya[3].u64[2]; - 
zb[2].u64[3] = yb[3].u64[2]; - zc[2].u64[3] = yc[3].u64[2]; - zd[2].u64[3] = yd[3].u64[2]; - - za[3].u64[3] = ya[0].u64[3]; - zb[3].u64[3] = yb[0].u64[3]; - zc[3].u64[3] = yc[0].u64[3]; - zd[3].u64[3] = yd[0].u64[3]; - za[3].u64[2] = ya[1].u64[3]; - zb[3].u64[2] = yb[1].u64[3]; - zc[3].u64[2] = yc[1].u64[3]; - zd[3].u64[2] = yd[1].u64[3]; - za[3].u64[1] = ya[2].u64[3]; - zb[3].u64[1] = yb[2].u64[3]; - zc[3].u64[1] = yc[2].u64[3]; - zd[3].u64[1] = yd[2].u64[3]; - za[3].u64[0] = ya[3].u64[3]; - zb[3].u64[0] = yb[3].u64[3]; - zc[3].u64[0] = yc[3].u64[3]; - zd[3].u64[0] = yd[3].u64[3]; - - ba[0] = _mm256_add_epi32( ba[0], za[0].m256 ); - bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 ); - bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 ); - bd[0] = _mm256_add_epi32( bd[0], zd[0].m256 ); - ba[1] = _mm256_add_epi32( ba[1], za[1].m256 ); - bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 ); - bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 ); - bd[1] = _mm256_add_epi32( bd[1], zd[1].m256 ); - ba[2] = _mm256_add_epi32( ba[2], za[2].m256 ); - bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 ); - bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 ); - bd[2] = _mm256_add_epi32( bd[2], zd[2].m256 ); - ba[3] = _mm256_add_epi32( ba[3], za[3].m256 ); - bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 ); - bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 ); - bd[3] = _mm256_add_epi32( bd[3], zd[3].m256 ); -} - -void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) - -{ - uint64_t *X0 = X; - uint64_t *X1 = X+32; - uint64_t *X2 = X+64; - uint64_t *X3 = X+96; - uint64_t *V0 = V; - uint64_t *V1 = V + 32*N; - uint64_t *V2 = V + 64*N; - uint64_t *V3 = V + 96*N; - - for ( int i = 0; i < N; i++ ) - { - memcpy( &V0[i * 32], X0, 2*128 ); - memcpy( &V1[i * 32], X1, 2*128 ); - memcpy( &V2[i * 32], X2, 2*128 ); - memcpy( &V3[i * 32], X3, 2*128 ); - salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } - - for ( int i = 0; i < N; i++ ) - { - // need 4 J's - uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) ); - uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) ); - uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) ); - uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) ); - uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) ); - uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) ); - uint32_t j3l = 32 * ( (uint32_t)( X3[16] ) & ( N-1 ) ); - uint32_t j3h = 32 * ( (uint32_t)( X3[16] >> 32 ) & ( N-1 ) ); - - for ( int k = 0; k < 32; k++ ) - { - X0[k] ^= ( ( V0[ j0h + k ] & 0xffffffff00000000 ) - | ( V0[ j0l + k ] & 0x00000000ffffffff ) ); - X1[k] ^= ( ( V1[ j1h + k ] & 0xffffffff00000000 ) - | ( V1[ j1l + k ] & 0x00000000ffffffff ) ); - X2[k] ^= ( ( V2[ j2h + k ] & 0xffffffff00000000 ) - | ( V2[ j2l + k ] & 0x00000000ffffffff ) ); - X3[k] ^= ( ( V3[ j3h + k ] & 0xffffffff00000000 ) - | ( V3[ j3l + k ] & 0x00000000ffffffff ) ); - } - salsa8_simd128_2way_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_2way_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } -} - #endif // AVX2 @@ -2344,13 +2088,13 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128*4 ); + memcpy( &V[ n*32 ], X, 
128*4 ); xor_salsa8_4way( &X[ 0], &X[16] ); xor_salsa8_4way( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { m128_ovly *vptr[4]; m128_ovly *x16 = (m128_ovly*)(&X[16]); @@ -2361,12 +2105,12 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] ); } - for ( int k = 0; k < 32; k++ ) + for ( int i = 0; i < 32; i++ ) { m128_ovly v; for ( int l = 0; l < 4; l++ ) - v.u32[l] = ( *(vptr[ l ] + k ) ) .u32[l]; - X[ k ] = _mm_xor_si128( X[ k ], v.m128 ); + v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; + X[i] = _mm_xor_si128( X[i], v.m128 ); } xor_salsa8_4way( &X[ 0], &X[16] ); @@ -2546,19 +2290,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) - _mm_stream_si128( (__m128i*)V + i*8 + k, casti_m128i( X, k ) ); + for ( int i = 0; i < 8; i++ ) + _mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) ); salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { const int j = 32 * ( X[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= V[j + k]; + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); } @@ -2566,253 +2310,290 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) // Double buffered, 2x memory usage // No interleaving -static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, - const uint32_t * const ca, const uint32_t * const cb ) -{ - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - - // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); +static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; #if defined(__SSE4_1__) - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; +// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3; #if defined(__AVX2__) - - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], 
BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); -#else // SSE4_1 + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + +#else - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); +// SSE4.1 - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); + YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, 
YB1, 0x0f ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); #endif // AVX2 else SSE4_1 - SALSA_8ROUNDS_SIMD128_2BUF; +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XA[1] = YA1; + XB[1] = YB1; + XA[2] = YA2; + XB[2] = YB2; + XA[3] = YA3; + XB[3] = YB3; + +#endif +} + +static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) +{ + + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); -#else // SSE4_1 +#else // SSE4_1 - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = 
_mm_blend_epi16( XB0, XB1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); -#endif // AVX2 else SSE4_1 - - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); +#endif // AVX2 else SSE4_1 #else // SSE2 m128_ovly ya[4], za[4], yb[4], zb[4]; - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_2BUF; - - // Final round doesn't shuffle data back to original input order, - // process it as is. 
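/* Editorial sketch, not part of the patch: the lane order produced by
 * salsa_simd128_shuffle_2buf and inverted below.  Reading the _mm_set_epi32
 * patterns above, each 128-bit vector holds one diagonal of the 4x4 Salsa20
 * state:
 *
 *    X[0] = { x0,  x5,  x10, x15 }      X[2] = { x8,  x13, x2,  x7  }
 *    X[1] = { x4,  x9,  x14, x3  }      X[3] = { x12, x1,  x6,  x11 }
 *
 * The same permutation written out in scalar form, with hypothetical helper
 * names (assumes <stdint.h>): */

   // static const int salsa_simd128_lane_map[16] =
   //    {  0,  5, 10, 15,   4,  9, 14,  3,   8, 13,  2,  7,  12,  1,  6, 11 };
   //
   // static void salsa_shuffle_scalar_sketch( uint32_t *dst, const uint32_t *src )
   // {  // dst in shuffled (diagonal) order, src in canonical order
   //    for ( int i = 0; i < 16; i++ )  dst[i] = src[ salsa_simd128_lane_map[i] ];
   // }
   //
   // static void salsa_unshuffle_scalar_sketch( uint32_t *dst, const uint32_t *src )
   // {  // inverse: scatter each shuffled word back to its canonical position,
   //    // which is what the u32[] overlay assignments below do four at a time
   //    for ( int i = 0; i < 16; i++ )  dst[ salsa_simd128_lane_map[i] ] = src[i];
   // }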
- - ya[0].m128 = XA0; - yb[0].m128 = XB0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; za[1].u32[1] = ya[0].u32[1]; zb[1].u32[1] = yb[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; za[2].u32[2] = ya[0].u32[2]; zb[2].u32[2] = yb[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + +#endif +} + +static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, + const uint32_t * const ca, const uint32_t * const cb ) +{ + __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + __m128i *BA = (__m128i*)ba; + __m128i *BB = (__m128i*)bb; + const __m128i *CA = (const __m128i*)ca; + const __m128i *CB = (const __m128i*)cb; + + // define targets for macros used in round function template + #define ROL_1X32 mm128_shufll_32 + #define ROR_1X32 mm128_shuflr_32 + #define SWAP_64 mm128_swap_64 + #define ROL32 mm128_rol_32 + #define ADD32 _mm_add_epi32 + #define XOR _mm_xor_si128 + #define TYPE __m128i + + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XA3 = BA[3] = _mm_xor_si128( 
BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_2BUF; + +#else + + SALSA_8ROUNDS_SIMD128_2BUF_SLOROT; + #endif + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + #undef ROL_1X32 #undef ROR_1X32 #undef SWAP_64 @@ -2822,570 +2603,425 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, #undef TYPE } - -// X: 2 sequential buffers -// V: 2 sequential buffers interleaved by the size of N -// interleaved buffers { v00, v01, v10, v11, v20... } -// void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) { - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; - for ( int i = 0; i < N; i++ ) + salsa_simd128_shuffle_2buf( X0, X1 ); + salsa_simd128_shuffle_2buf( X0+16, X1+16 ); + + for ( int n = 0; n < N; n++ ) { - #if defined(__AVX2__) + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) + { + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + } - for ( int k = 0; k < 4; k++ ) + #elif defined(__SSE4_1__) + + for ( int i = 0; i < 8; i++ ) { - _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); } #else - memcpy( &V0[ i*32 ], X0, 128 ); - memcpy( &V1[ i*32 ], X1, 128 ); + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); #endif - salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); - salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { #if defined(__AVX2__) const int j0 = 4 * ( X0[16] & ( N-1 ) ); const int j1 = 4 * ( X1[16] & ( N-1 ) ); - for ( int k = 0; k < 4; k++ ) - { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); -// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); -// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - } + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), 
v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); #else const int j0 = 8 * ( X0[16] & ( N-1 ) ); const int j1 = 8 * ( X1[16] & ( N-1 ) ); - for ( int k = 0; k < 8; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); } #endif -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - - for ( int k = 0; k < 16; k++ ) - { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - } -*/ - -/* - const int j0 = 32 * ( X0[16] & ( N-1 ) ); - const int j1 = 32 * ( X1[16] & ( N-1 ) ); - - for ( int k = 0; k < 32; k++ ) - { - const uint32_t v0 = V0[ j0+k ]; - const uint32_t v1 = V1[ j1+k ]; - X0[k] ^= v0; - X1[k] ^= v1; - } -*/ - - salsa8_simd128_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] ); - salsa8_simd128_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] ); + salsa8_simd128_2buf( X0, X1, X0+16, X1+16 ); + salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 ); } + + salsa_simd128_unshuffle_2buf( X0, X1 ); + salsa_simd128_unshuffle_2buf( X0+16, X1+16 ); } -// Triple buffered, 3x memory usage -// No interleaving -static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, - const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) +static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, + uint32_t *xc ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - __m128i *BC = (__m128i*)bc; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - const __m128i *CC = (const __m128i*)cc; - - // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i - - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BC[0] = _mm_xor_si128( BC[0], CC[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BC[1] = _mm_xor_si128( BC[1], CC[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BC[2] = _mm_xor_si128( BC[2], CC[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); - BC[3] = _mm_xor_si128( BC[3], CC[3] ); + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + __m128i YA0, YA1, YA2, 
YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; #if defined(__SSE4_1__) - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + __m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); - - XA1 = _mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); - XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); - - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); - XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); - - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); - XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); + YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 ); + YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 ); + YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 ); + ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 ); + ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 ); + ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 ); + + YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 ); + ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 ); + ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 ); + ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 ); + + YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 ); + YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 ); + YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 ); + ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 ); + ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 ); + ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 ); + + YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 ); + YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 ); + YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 ); + ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 ); + ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 ); + ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 ); + + XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 ); + XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 ); + XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 ); + + XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 ); + XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 ); + XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 ); + + XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 ); + XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 ); + XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 ); + + XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 ); + XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 ); + XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 ); + +#else + +// SSE4.1 + + YA0 = _mm_blend_epi16( XA[1], 
XA[0], 0x03 ); + YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 ); + YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 ); + ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 ); + ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 ); + ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 ); + + YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 ); + ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 ); + ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 ); + ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 ); + + YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 ); + YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 ); + YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 ); + ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 ); + ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 ); + ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 ); + + YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 ); + YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 ); + YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 ); + ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 ); + ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 ); + ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 ); + + XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f ); + XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f ); + XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f ); + + XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f ); + XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f ); + XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f ); + + XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f ); + XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f ); + XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f ); + + XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f ); + XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f ); + XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f ); -#else // SSE4_1 +#endif // AVX2 else SSE4_1 - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); - XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); - XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); - - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); - XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); - XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); - - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); - XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); - XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); - - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); - XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); - XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); - -#endif // AVX2 else SSE3_1 +#else // SSE2 + + YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], 
xc[ 0] ); + YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); + YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); + YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] ); + + XA[0] = YA0; + XB[0] = YB0; + XC[0] = YC0; + XA[1] = YA1; + XB[1] = YB1; + XC[1] = YC1; + XA[2] = YA2; + XB[2] = YB2; + XC[2] = YC2; + XA[3] = YA3; + XB[3] = YB3; + XC[3] = YC3; - SALSA_8ROUNDS_SIMD128_3BUF; +#endif +} + +static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, + uint32_t* xc ) +{ + __m128i *XA = (__m128i*)xa; + __m128i *XB = (__m128i*)xb; + __m128i *XC = (__m128i*)xc; + +#if defined(__SSE4_1__) + + __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; #if defined(__AVX2__) - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, XB2, 0x4 ); - YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); - YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); + YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 ); + YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 ); + YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 ); + YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 ); + YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 ); + YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 ); + YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 ); + YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 ); + YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 ); + YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 ); + YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 ); + YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 ); + + YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 ); + YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 ); + YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 ); + YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 ); + YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 ); + YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 ); + YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 ); + YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 ); + YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 ); + YA3 = _mm_blend_epi32( YA3, XA[2], 
0x2 ); + YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 ); + YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 ); + + XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 ); + XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 ); + XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 ); + XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 ); + XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 ); + XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 ); + XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 ); + XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 ); + XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 ); + XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 ); + XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 ); + XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 ); #else // SSE4_1 - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); - YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); - YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); + YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 ); + YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 ); + YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 ); + YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 ); + YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 ); + YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 ); + YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c ); + YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c ); + YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c ); + YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 ); + YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 ); + YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 ); + + YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 ); + YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 ); + YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 ); + YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 ); + YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 ); + YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 ); + YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 ); + YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 ); + YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 ); + YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c ); + YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c ); + YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c ); + + XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c ); + XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c ); + XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c ); + XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 ); + XB[1] = _mm_blend_epi16( YB1, XB[3], 
0x30 ); + XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 ); + XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 ); + XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 ); + XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 ); + XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 ); + XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 ); + XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 ); #endif // AVX2 else SSE4_1 - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BC[0] = _mm_add_epi32( BC[0], YC0 ); - BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BC[1] = _mm_add_epi32( BC[1], YC1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BC[2] = _mm_add_epi32( BC[2], YC2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); - BC[3] = _mm_add_epi32( BC[3], YC3 ); - #else // SSE2 - m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4]; - - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_3BUF; - - // Final round doesn't shuffle data back to original input order, - // process it as is. - - ya[0].m128 = XA0; - yb[0].m128 = XB0; - yc[0].m128 = XC0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - yc[1].m128 = XC1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - yc[2].m128 = XC2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; - yc[3].m128 = XC3; + m128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4]; + + ya[0].m128 = XA[0]; + yb[0].m128 = XB[0]; + yc[0].m128 = XC[0]; + ya[1].m128 = XA[1]; + yb[1].m128 = XB[1]; + yc[1].m128 = XC[1]; + ya[2].m128 = XA[2]; + yb[2].m128 = XB[2]; + yc[2].m128 = XC[2]; + ya[3].m128 = XA[3]; + yb[3].m128 = XB[3]; + yc[3].m128 = XC[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; zc[0].u32[0] = yc[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - zc[0].u32[3] = yc[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - zc[0].u32[2] = yc[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - zc[0].u32[1] = yc[3].u32[0]; - + za[0].u32[1] = ya[3].u32[1]; + zb[0].u32[1] = yb[3].u32[1]; + zc[0].u32[1] = yc[3].u32[1]; + za[0].u32[2] = ya[2].u32[2]; + zb[0].u32[2] = yb[2].u32[2]; + zc[0].u32[2] = yc[2].u32[2]; + za[0].u32[3] = ya[1].u32[3]; + zb[0].u32[3] = yb[1].u32[3]; + zc[0].u32[3] = yc[1].u32[3]; + + za[1].u32[0] = ya[1].u32[0]; + zb[1].u32[0] = yb[1].u32[0]; + zc[1].u32[0] = yc[1].u32[0]; za[1].u32[1] = ya[0].u32[1]; zb[1].u32[1] = yb[0].u32[1]; zc[1].u32[1] = yc[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - zc[1].u32[0] = yc[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - zc[1].u32[3] = yc[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - zc[1].u32[2] = yc[3].u32[1]; - + za[1].u32[2] = ya[3].u32[2]; + zb[1].u32[2] = yb[3].u32[2]; + zc[1].u32[2] = yc[3].u32[2]; + za[1].u32[3] = ya[2].u32[3]; + zb[1].u32[3] = yb[2].u32[3]; + 
zc[1].u32[3] = yc[2].u32[3]; + + za[2].u32[0] = ya[2].u32[0]; + zb[2].u32[0] = yb[2].u32[0]; + zc[2].u32[0] = yc[2].u32[0]; + za[2].u32[1] = ya[1].u32[1]; + zb[2].u32[1] = yb[1].u32[1]; + zc[2].u32[1] = yc[1].u32[1]; za[2].u32[2] = ya[0].u32[2]; zb[2].u32[2] = yb[0].u32[2]; zc[2].u32[2] = yc[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - zc[2].u32[1] = yc[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - zc[2].u32[0] = yc[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - zc[2].u32[3] = yc[3].u32[2]; - + za[2].u32[3] = ya[3].u32[3]; + zb[2].u32[3] = yb[3].u32[3]; + zc[2].u32[3] = yc[3].u32[3]; + + za[3].u32[0] = ya[3].u32[0]; + zb[3].u32[0] = yb[3].u32[0]; + zc[3].u32[0] = yc[3].u32[0]; + za[3].u32[1] = ya[2].u32[1]; + zb[3].u32[1] = yb[2].u32[1]; + zc[3].u32[1] = yc[2].u32[1]; + za[3].u32[2] = ya[1].u32[2]; + zb[3].u32[2] = yb[1].u32[2]; + zc[3].u32[2] = yc[1].u32[2]; za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; zc[3].u32[3] = yc[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - zc[3].u32[2] = yc[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - zc[3].u32[1] = yc[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - zc[3].u32[0] = yc[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); - BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); - -#endif - - #undef ROL_1X32 - #undef ROR_1X32 - #undef SWAP_64 - #undef ROL32 - #undef ADD32 - #undef XOR - #undef TYPE -} - -void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) -{ - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *X2 = X+64; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; - uint32_t *V2 = V + 64*N; - - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - for ( int k = 0; k < 4; k++ ) - { - _mm256_stream_si256( (__m256i*)V0 + i*4 + k, casti_m256i( X0, k ) ); - _mm256_stream_si256( (__m256i*)V1 + i*4 + k, casti_m256i( X1, k ) ); - _mm256_stream_si256( (__m256i*)V2 + i*4 + k, casti_m256i( X2, k ) ); - } - - #else - - memcpy( &V0[ i*32 ], X0, 128 ); - memcpy( &V1[ i*32 ], X1, 128 ); - memcpy( &V2[ i*32 ], X2, 128 ); - - #endif - - salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], - &X0[16], &X1[16], &X2[16] ); - salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], - &X0[ 0], &X1[ 0], &X2[ 0] ); - } - - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - const int j0 = 4 * ( X0[16] & ( N - 1 ) ); - const int j1 = 4 * ( X1[16] & ( N - 1 ) ); - const int j2 = 4 * ( X2[16] & ( N - 1 ) ); - - for ( int k = 0; k < 4; k++ ) - { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); - const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); -// const __m256i v0 = _mm256_load_si256( ( (__m256i*)V0 ) +j0+k ); -// const __m256i v1 = _mm256_load_si256( ( (__m256i*)V1 ) +j1+k ); -// const __m256i v2 = _mm256_load_si256( ( (__m256i*)V2 ) +j2+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( 
X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); - } - - #else - - const int j0 = 8 * ( X0[16] & ( N - 1 ) ); - const int j1 = 8 * ( X1[16] & ( N - 1 ) ); - const int j2 = 8 * ( X2[16] & ( N - 1 ) ); - for ( int k = 0; k < 8; k++ ) - { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); - casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); - } - - #endif - -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - const int j2 = 16 * ( X2[16] & ( N - 1 ) ); - - for ( int k = 0; k < 16; k++ ) - { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - ( (uint64_t*)X2 )[k] ^= v2; - } -*/ - -/* - const int j0 = 32 * ( X0[16] & ( N - 1 ) ); - const int j1 = 32 * ( X1[16] & ( N - 1 ) ); - const int j2 = 32 * ( X2[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - { - const uint32_t v0 = V0[ j0+k ]; - const uint32_t v1 = V1[ j1+k ]; - const uint32_t v2 = V2[ j2+k ]; - X0[k] ^= v0; - X1[k] ^= v1; - X2[k] ^= v2; - } -*/ - - salsa8_simd128_3buf( &X0[ 0], &X1[ 0], &X2[ 0], - &X0[16], &X1[16], &X2[16] ); - salsa8_simd128_3buf( &X0[16], &X1[16], &X2[16], - &X0[ 0], &X1[ 0], &X2[ 0] ); - } -} + XA[0] = za[0].m128; + XB[0] = zb[0].m128; + XC[0] = zc[0].m128; + XA[1] = za[1].m128; + XB[1] = zb[1].m128; + XC[1] = zc[1].m128; + XA[2] = za[2].m128; + XB[2] = zb[2].m128; + XC[2] = zc[2].m128; + XA[3] = za[3].m128; + XB[3] = zb[3].m128; + XC[3] = zc[3].m128; + +#endif +} -// Working. 
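/* Editorial sketch, not part of the patch: both the 2- and 3-buffer scrypt
 * cores now follow the same restructured flow -- shuffle the block into Salsa
 * lane order once, run the whole N-iteration mix on pre-shuffled data, then
 * unshuffle once at the end.  Outline using the 2-buffer names and only the
 * portable memcpy/XOR fallback paths; the real functions add streaming
 * AVX/SSE4.1 store and load variants, and the outline name is invented here. */

   // static void scrypt_core_simd128_2buf_outline( uint32_t *X, uint32_t *V,
   //                                               const uint32_t N )
   // {
   //    uint32_t *X0 = X, *X1 = X + 32;
   //    uint32_t *V0 = V, *V1 = V + 32*N;
   //
   //    salsa_simd128_shuffle_2buf( X0,    X1    );   // convert once, up front
   //    salsa_simd128_shuffle_2buf( X0+16, X1+16 );
   //
   //    for ( int n = 0; n < N; n++ )            // fill V with successive states
   //    {
   //       memcpy( &V0[ n*32 ], X0, 128 );
   //       memcpy( &V1[ n*32 ], X1, 128 );
   //       salsa8_simd128_2buf( X0,    X1,    X0+16, X1+16 );
   //       salsa8_simd128_2buf( X0+16, X1+16, X0,    X1    );
   //    }
   //
   //    for ( int n = 0; n < N; n++ )            // data-dependent reads from V
   //    {
   //       const int j0 = 32 * ( X0[16] & (N-1) );
   //       const int j1 = 32 * ( X1[16] & (N-1) );
   //       for ( int i = 0; i < 32; i++ )
   //       {
   //          X0[i] ^= V0[ j0+i ];
   //          X1[i] ^= V1[ j1+i ];
   //       }
   //       salsa8_simd128_2buf( X0,    X1,    X0+16, X1+16 );
   //       salsa8_simd128_2buf( X0+16, X1+16, X0,    X1    );
   //    }
   //
   //    salsa_simd128_unshuffle_2buf( X0,    X1    ); // restore word order
   //    salsa_simd128_unshuffle_2buf( X0+16, X1+16 );
   // }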
-// Quadruple buffered, 4x memory usage +// Triple buffered, 3x memory usage // No interleaving -static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, - uint32_t *bd, const uint32_t *ca, const uint32_t *cb, - const uint32_t *cc, const uint32_t *cd ) +static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, + const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) { __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, - XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3; + XC0, XC1, XC2, XC3; __m128i *BA = (__m128i*)ba; __m128i *BB = (__m128i*)bb; __m128i *BC = (__m128i*)bc; - __m128i *BD = (__m128i*)bd; const __m128i *CA = (const __m128i*)ca; const __m128i *CB = (const __m128i*)cb; const __m128i *CC = (const __m128i*)cc; - const __m128i *CD = (const __m128i*)cd; // define targets for macros used in round function template #define ROL_1X32 mm128_shufll_32 @@ -3396,397 +3032,42 @@ static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #define XOR _mm_xor_si128 #define TYPE __m128i - // mix C into B then shuffle B into X - BA[0] = _mm_xor_si128( BA[0], CA[0] ); - BB[0] = _mm_xor_si128( BB[0], CB[0] ); - BC[0] = _mm_xor_si128( BC[0], CC[0] ); - BD[0] = _mm_xor_si128( BD[0], CD[0] ); - BA[1] = _mm_xor_si128( BA[1], CA[1] ); - BB[1] = _mm_xor_si128( BB[1], CB[1] ); - BC[1] = _mm_xor_si128( BC[1], CC[1] ); - BD[1] = _mm_xor_si128( BD[1], CD[1] ); - BA[2] = _mm_xor_si128( BA[2], CA[2] ); - BB[2] = _mm_xor_si128( BB[2], CB[2] ); - BC[2] = _mm_xor_si128( BC[2], CC[2] ); - BD[2] = _mm_xor_si128( BD[2], CD[2] ); - BA[3] = _mm_xor_si128( BA[3], CA[3] ); - BB[3] = _mm_xor_si128( BB[3], CB[3] ); - BC[3] = _mm_xor_si128( BC[3], CC[3] ); - BD[3] = _mm_xor_si128( BD[3], CD[3] ); - -#if defined(__SSE4_1__) - - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, - YC0, YC1, YC2, YC3, YD0, YD1, YD2, YD3; - -#if defined(__AVX2__) - - YA0 = _mm_blend_epi32( BA[1], BA[0], 0x1 ); - YB0 = _mm_blend_epi32( BB[1], BB[0], 0x1 ); - YC0 = _mm_blend_epi32( BC[1], BC[0], 0x1 ); - YD0 = _mm_blend_epi32( BD[1], BD[0], 0x1 ); - XA0 = _mm_blend_epi32( BA[3], BA[2], 0x4 ); - XB0 = _mm_blend_epi32( BB[3], BB[2], 0x4 ); - XC0 = _mm_blend_epi32( BC[3], BC[2], 0x4 ); - XD0 = _mm_blend_epi32( BD[3], BD[2], 0x4 ); - - YA1 = _mm_blend_epi32( BA[2], BA[1], 0x1 ); - YB1 = _mm_blend_epi32( BB[2], BB[1], 0x1 ); - YC1 = _mm_blend_epi32( BC[2], BC[1], 0x1 ); - YD1 = _mm_blend_epi32( BD[2], BD[1], 0x1 ); - XA1 = _mm_blend_epi32( BA[0], BA[3], 0x4 ); - XB1 = _mm_blend_epi32( BB[0], BB[3], 0x4 ); - XC1 = _mm_blend_epi32( BC[0], BC[3], 0x4 ); - XD1 = _mm_blend_epi32( BD[0], BD[3], 0x4 ); - - YA2 = _mm_blend_epi32( BA[3], BA[2], 0x1 ); - YB2 = _mm_blend_epi32( BB[3], BB[2], 0x1 ); - YC2 = _mm_blend_epi32( BC[3], BC[2], 0x1 ); - YD2 = _mm_blend_epi32( BD[3], BD[2], 0x1 ); - XA2 = _mm_blend_epi32( BA[1], BA[0], 0x4 ); - XB2 = _mm_blend_epi32( BB[1], BB[0], 0x4 ); - XC2 = _mm_blend_epi32( BC[1], BC[0], 0x4 ); - XD2 = _mm_blend_epi32( BD[1], BD[0], 0x4 ); - - YA3 = _mm_blend_epi32( BA[0], BA[3], 0x1 ); - YB3 = _mm_blend_epi32( BB[0], BB[3], 0x1 ); - YC3 = _mm_blend_epi32( BC[0], BC[3], 0x1 ); - YD3 = _mm_blend_epi32( BD[0], BD[3], 0x1 ); - XA3 = _mm_blend_epi32( BA[2], BA[1], 0x4 ); - XB3 = _mm_blend_epi32( BB[2], BB[1], 0x4 ); - XC3 = _mm_blend_epi32( BC[2], BC[1], 0x4 ); - XD3 = _mm_blend_epi32( BD[2], BD[1], 0x4 ); - - XA0 = _mm_blend_epi32( XA0, YA0, 0x3 ); - XB0 = _mm_blend_epi32( XB0, YB0, 0x3 ); - XC0 = _mm_blend_epi32( XC0, YC0, 0x3 ); - XD0 = _mm_blend_epi32( XD0, YD0, 0x3 ); - - XA1 = 
_mm_blend_epi32( XA1, YA1, 0x3 ); - XB1 = _mm_blend_epi32( XB1, YB1, 0x3 ); - XC1 = _mm_blend_epi32( XC1, YC1, 0x3 ); - XD1 = _mm_blend_epi32( XD1, YD1, 0x3 ); - - XA2 = _mm_blend_epi32( XA2, YA2, 0x3 ); - XB2 = _mm_blend_epi32( XB2, YB2, 0x3 ); - XC2 = _mm_blend_epi32( XC2, YC2, 0x3 ); - XD2 = _mm_blend_epi32( XD2, YD2, 0x3 ); - - XA3 = _mm_blend_epi32( XA3, YA3, 0x3 ); - XB3 = _mm_blend_epi32( XB3, YB3, 0x3 ); - XC3 = _mm_blend_epi32( XC3, YC3, 0x3 ); - XD3 = _mm_blend_epi32( XD3, YD3, 0x3 ); - -#else // SSE4_1 - - YA0 = _mm_blend_epi16( BA[1], BA[0], 0x03 ); - YB0 = _mm_blend_epi16( BB[1], BB[0], 0x03 ); - YC0 = _mm_blend_epi16( BC[1], BC[0], 0x03 ); - YD0 = _mm_blend_epi16( BD[1], BD[0], 0x03 ); - XA0 = _mm_blend_epi16( BA[3], BA[2], 0x30 ); - XB0 = _mm_blend_epi16( BB[3], BB[2], 0x30 ); - XC0 = _mm_blend_epi16( BC[3], BC[2], 0x30 ); - XD0 = _mm_blend_epi16( BD[3], BD[2], 0x30 ); - XA0 = _mm_blend_epi16( XA0, YA0, 0x0f ); - XB0 = _mm_blend_epi16( XB0, YB0, 0x0f ); - XC0 = _mm_blend_epi16( XC0, YC0, 0x0f ); - XD0 = _mm_blend_epi16( XD0, YD0, 0x0f ); - - YA1 = _mm_blend_epi16( BA[2], BA[1], 0x03 ); - YB1 = _mm_blend_epi16( BB[2], BB[1], 0x03 ); - YC1 = _mm_blend_epi16( BC[2], BC[1], 0x03 ); - YD1 = _mm_blend_epi16( BD[2], BD[1], 0x03 ); - XA1 = _mm_blend_epi16( BA[0], BA[3], 0x30 ); - XB1 = _mm_blend_epi16( BB[0], BB[3], 0x30 ); - XC1 = _mm_blend_epi16( BC[0], BC[3], 0x30 ); - XD1 = _mm_blend_epi16( BD[0], BD[3], 0x30 ); - XA1 = _mm_blend_epi16( XA1, YA1, 0x0f ); - XB1 = _mm_blend_epi16( XB1, YB1, 0x0f ); - XC1 = _mm_blend_epi16( XC1, YC1, 0x0f ); - XD1 = _mm_blend_epi16( XD1, YD1, 0x0f ); - - YA2 = _mm_blend_epi16( BA[3], BA[2], 0x03 ); - YB2 = _mm_blend_epi16( BB[3], BB[2], 0x03 ); - YC2 = _mm_blend_epi16( BC[3], BC[2], 0x03 ); - YD2 = _mm_blend_epi16( BD[3], BD[2], 0x03 ); - XA2 = _mm_blend_epi16( BA[1], BA[0], 0x30 ); - XB2 = _mm_blend_epi16( BB[1], BB[0], 0x30 ); - XC2 = _mm_blend_epi16( BC[1], BC[0], 0x30 ); - XD2 = _mm_blend_epi16( BD[1], BD[0], 0x30 ); - XA2 = _mm_blend_epi16( XA2, YA2, 0x0f ); - XB2 = _mm_blend_epi16( XB2, YB2, 0x0f ); - XC2 = _mm_blend_epi16( XC2, YC2, 0x0f ); - XD2 = _mm_blend_epi16( XD2, YD2, 0x0f ); - - YA3 = _mm_blend_epi16( BA[0], BA[3], 0x03 ); - YB3 = _mm_blend_epi16( BB[0], BB[3], 0x03 ); - YC3 = _mm_blend_epi16( BC[0], BC[3], 0x03 ); - YD3 = _mm_blend_epi16( BD[0], BD[3], 0x03 ); - XA3 = _mm_blend_epi16( BA[2], BA[1], 0x30 ); - XB3 = _mm_blend_epi16( BB[2], BB[1], 0x30 ); - XC3 = _mm_blend_epi16( BC[2], BC[1], 0x30 ); - XD3 = _mm_blend_epi16( BD[2], BD[1], 0x30 ); - XA3 = _mm_blend_epi16( XA3, YA3, 0x0f ); - XB3 = _mm_blend_epi16( XB3, YB3, 0x0f ); - XC3 = _mm_blend_epi16( XC3, YC3, 0x0f ); - XD3 = _mm_blend_epi16( XD3, YD3, 0x0f ); - -#endif // AVX2 else SSE3_1 - - SALSA_8ROUNDS_SIMD128_4BUF; - -#if defined(__AVX2__) - - YA0 = _mm_blend_epi32( XA0, XA1, 0x8 ); - YB0 = _mm_blend_epi32( XB0, XB1, 0x8 ); - YC0 = _mm_blend_epi32( XC0, XC1, 0x8 ); - YD0 = _mm_blend_epi32( XD0, XD1, 0x8 ); - YA1 = _mm_blend_epi32( XA0, XA1, 0x1 ); - YB1 = _mm_blend_epi32( XB0, XB1, 0x1 ); - YC1 = _mm_blend_epi32( XC0, XC1, 0x1 ); - YD1 = _mm_blend_epi32( XD0, XD1, 0x1 ); - YA2 = _mm_blend_epi32( XA0, XA1, 0x2 ); - YB2 = _mm_blend_epi32( XB0, XB1, 0x2 ); - YC2 = _mm_blend_epi32( XC0, XC1, 0x2 ); - YD2 = _mm_blend_epi32( XD0, XD1, 0x2 ); - YA3 = _mm_blend_epi32( XA0, XA1, 0x4 ); - YB3 = _mm_blend_epi32( XB0, XB1, 0x4 ); - YC3 = _mm_blend_epi32( XC0, XC1, 0x4 ); - YD3 = _mm_blend_epi32( XD0, XD1, 0x4 ); - - YA0 = _mm_blend_epi32( YA0, XA2, 0x4 ); - YB0 = _mm_blend_epi32( YB0, 
XB2, 0x4 ); - YC0 = _mm_blend_epi32( YC0, XC2, 0x4 ); - YD0 = _mm_blend_epi32( YD0, XD2, 0x4 ); - YA1 = _mm_blend_epi32( YA1, XA2, 0x8 ); - YB1 = _mm_blend_epi32( YB1, XB2, 0x8 ); - YC1 = _mm_blend_epi32( YC1, XC2, 0x8 ); - YD1 = _mm_blend_epi32( YD1, XD2, 0x8 ); - YA2 = _mm_blend_epi32( YA2, XA2, 0x1 ); - YB2 = _mm_blend_epi32( YB2, XB2, 0x1 ); - YC2 = _mm_blend_epi32( YC2, XC2, 0x1 ); - YD2 = _mm_blend_epi32( YD2, XD2, 0x1 ); - YA3 = _mm_blend_epi32( YA3, XA2, 0x2 ); - YB3 = _mm_blend_epi32( YB3, XB2, 0x2 ); - YC3 = _mm_blend_epi32( YC3, XC2, 0x2 ); - YD3 = _mm_blend_epi32( YD3, XD2, 0x2 ); - - YA0 = _mm_blend_epi32( YA0, XA3, 0x2 ); - YB0 = _mm_blend_epi32( YB0, XB3, 0x2 ); - YC0 = _mm_blend_epi32( YC0, XC3, 0x2 ); - YD0 = _mm_blend_epi32( YD0, XD3, 0x2 ); - YA1 = _mm_blend_epi32( YA1, XA3, 0x4 ); - YB1 = _mm_blend_epi32( YB1, XB3, 0x4 ); - YC1 = _mm_blend_epi32( YC1, XC3, 0x4 ); - YD1 = _mm_blend_epi32( YD1, XD3, 0x4 ); - YA2 = _mm_blend_epi32( YA2, XA3, 0x8 ); - YB2 = _mm_blend_epi32( YB2, XB3, 0x8 ); - YC2 = _mm_blend_epi32( YC2, XC3, 0x8 ); - YD2 = _mm_blend_epi32( YD2, XD3, 0x8 ); - YA3 = _mm_blend_epi32( YA3, XA3, 0x1 ); - YB3 = _mm_blend_epi32( YB3, XB3, 0x1 ); - YC3 = _mm_blend_epi32( YC3, XC3, 0x1 ); - YD3 = _mm_blend_epi32( YD3, XD3, 0x1 ); - -#else // SSE4_1 - - YA0 = _mm_blend_epi16( XA0, XA1, 0xc0 ); - YB0 = _mm_blend_epi16( XB0, XB1, 0xc0 ); - YC0 = _mm_blend_epi16( XC0, XC1, 0xc0 ); - YD0 = _mm_blend_epi16( XD0, XD1, 0xc0 ); - YA1 = _mm_blend_epi16( XA0, XA1, 0x03 ); - YB1 = _mm_blend_epi16( XB0, XB1, 0x03 ); - YC1 = _mm_blend_epi16( XC0, XC1, 0x03 ); - YD1 = _mm_blend_epi16( XD0, XD1, 0x03 ); - YA2 = _mm_blend_epi16( XA0, XA1, 0x0c ); - YB2 = _mm_blend_epi16( XB0, XB1, 0x0c ); - YC2 = _mm_blend_epi16( XC0, XC1, 0x0c ); - YD2 = _mm_blend_epi16( XD0, XD1, 0x0c ); - YA3 = _mm_blend_epi16( XA0, XA1, 0x30 ); - YB3 = _mm_blend_epi16( XB0, XB1, 0x30 ); - YC3 = _mm_blend_epi16( XC0, XC1, 0x30 ); - YD3 = _mm_blend_epi16( XD0, XD1, 0x30 ); - - YA0 = _mm_blend_epi16( YA0, XA2, 0x30 ); - YB0 = _mm_blend_epi16( YB0, XB2, 0x30 ); - YC0 = _mm_blend_epi16( YC0, XC2, 0x30 ); - YD0 = _mm_blend_epi16( YD0, XD2, 0x30 ); - YA1 = _mm_blend_epi16( YA1, XA2, 0xc0 ); - YB1 = _mm_blend_epi16( YB1, XB2, 0xc0 ); - YC1 = _mm_blend_epi16( YC1, XC2, 0xc0 ); - YD1 = _mm_blend_epi16( YD1, XD2, 0xc0 ); - YA2 = _mm_blend_epi16( YA2, XA2, 0x03 ); - YB2 = _mm_blend_epi16( YB2, XB2, 0x03 ); - YC2 = _mm_blend_epi16( YC2, XC2, 0x03 ); - YD2 = _mm_blend_epi16( YD2, XD2, 0x03 ); - YA3 = _mm_blend_epi16( YA3, XA2, 0x0c ); - YB3 = _mm_blend_epi16( YB3, XB2, 0x0c ); - YC3 = _mm_blend_epi16( YC3, XC2, 0x0c ); - YD3 = _mm_blend_epi16( YD3, XD2, 0x0c ); - - YA0 = _mm_blend_epi16( YA0, XA3, 0x0c ); - YB0 = _mm_blend_epi16( YB0, XB3, 0x0c ); - YC0 = _mm_blend_epi16( YC0, XC3, 0x0c ); - YD0 = _mm_blend_epi16( YD0, XD3, 0x0c ); - YA1 = _mm_blend_epi16( YA1, XA3, 0x30 ); - YB1 = _mm_blend_epi16( YB1, XB3, 0x30 ); - YC1 = _mm_blend_epi16( YC1, XC3, 0x30 ); - YD1 = _mm_blend_epi16( YD1, XD3, 0x30 ); - YA2 = _mm_blend_epi16( YA2, XA3, 0xc0 ); - YB2 = _mm_blend_epi16( YB2, XB3, 0xc0 ); - YC2 = _mm_blend_epi16( YC2, XC3, 0xc0 ); - YD2 = _mm_blend_epi16( YD2, XD3, 0xc0 ); - YA3 = _mm_blend_epi16( YA3, XA3, 0x03 ); - YB3 = _mm_blend_epi16( YB3, XB3, 0x03 ); - YC3 = _mm_blend_epi16( YC3, XC3, 0x03 ); - YD3 = _mm_blend_epi16( YD3, XD3, 0x03 ); - -#endif // AVX2 else SSE4_1 - - BA[0] = _mm_add_epi32( BA[0], YA0 ); - BB[0] = _mm_add_epi32( BB[0], YB0 ); - BC[0] = _mm_add_epi32( BC[0], YC0 ); - BD[0] = _mm_add_epi32( BD[0], YD0 ); - 
BA[1] = _mm_add_epi32( BA[1], YA1 ); - BB[1] = _mm_add_epi32( BB[1], YB1 ); - BC[1] = _mm_add_epi32( BC[1], YC1 ); - BD[1] = _mm_add_epi32( BD[1], YD1 ); - BA[2] = _mm_add_epi32( BA[2], YA2 ); - BB[2] = _mm_add_epi32( BB[2], YB2 ); - BC[2] = _mm_add_epi32( BC[2], YC2 ); - BD[2] = _mm_add_epi32( BD[2], YD2 ); - BA[3] = _mm_add_epi32( BA[3], YA3 ); - BB[3] = _mm_add_epi32( BB[3], YB3 ); - BC[3] = _mm_add_epi32( BC[3], YC3 ); - BD[3] = _mm_add_epi32( BD[3], YD3 ); - -#else // SSE2 - - m128_ovly ya[4], yb[4], za[4], zb[4], yc[4], zc[4], yd[4], zd[4]; - - XA0 = _mm_set_epi32( ba[15], ba[10], ba[ 5], ba[ 0] ); - XB0 = _mm_set_epi32( bb[15], bb[10], bb[ 5], bb[ 0] ); - XC0 = _mm_set_epi32( bc[15], bc[10], bc[ 5], bc[ 0] ); - XD0 = _mm_set_epi32( bd[15], bd[10], bd[ 5], bd[ 0] ); - XA1 = _mm_set_epi32( ba[ 3], ba[14], ba[ 9], ba[ 4] ); - XB1 = _mm_set_epi32( bb[ 3], bb[14], bb[ 9], bb[ 4] ); - XC1 = _mm_set_epi32( bc[ 3], bc[14], bc[ 9], bc[ 4] ); - XD1 = _mm_set_epi32( bd[ 3], bd[14], bd[ 9], bd[ 4] ); - XA2 = _mm_set_epi32( ba[ 7], ba[ 2], ba[13], ba[ 8] ); - XB2 = _mm_set_epi32( bb[ 7], bb[ 2], bb[13], bb[ 8] ); - XC2 = _mm_set_epi32( bc[ 7], bc[ 2], bc[13], bc[ 8] ); - XD2 = _mm_set_epi32( bd[ 7], bd[ 2], bd[13], bd[ 8] ); - XA3 = _mm_set_epi32( ba[11], ba[ 6], ba[ 1], ba[12] ); - XB3 = _mm_set_epi32( bb[11], bb[ 6], bb[ 1], bb[12] ); - XC3 = _mm_set_epi32( bc[11], bc[ 6], bc[ 1], bc[12] ); - XD3 = _mm_set_epi32( bd[11], bd[ 6], bd[ 1], bd[12] ); - - SALSA_8ROUNDS_FINAL_SIMD128_4BUF; - - ya[0].m128 = XA0; - yb[0].m128 = XB0; - yc[0].m128 = XC0; - yd[0].m128 = XD0; - ya[1].m128 = XA1; - yb[1].m128 = XB1; - yc[1].m128 = XC1; - yd[1].m128 = XD1; - ya[2].m128 = XA2; - yb[2].m128 = XB2; - yc[2].m128 = XC2; - yd[2].m128 = XD2; - ya[3].m128 = XA3; - yb[3].m128 = XB3; - yc[3].m128 = XC3; - yd[3].m128 = XD3; - - za[0].u32[0] = ya[0].u32[0]; - zb[0].u32[0] = yb[0].u32[0]; - zc[0].u32[0] = yc[0].u32[0]; - zd[0].u32[0] = yd[0].u32[0]; - za[0].u32[3] = ya[1].u32[0]; - zb[0].u32[3] = yb[1].u32[0]; - zc[0].u32[3] = yc[1].u32[0]; - zd[0].u32[3] = yd[1].u32[0]; - za[0].u32[2] = ya[2].u32[0]; - zb[0].u32[2] = yb[2].u32[0]; - zc[0].u32[2] = yc[2].u32[0]; - zd[0].u32[2] = yd[2].u32[0]; - za[0].u32[1] = ya[3].u32[0]; - zb[0].u32[1] = yb[3].u32[0]; - zc[0].u32[1] = yc[3].u32[0]; - zd[0].u32[1] = yd[3].u32[0]; - - za[1].u32[1] = ya[0].u32[1]; - zb[1].u32[1] = yb[0].u32[1]; - zc[1].u32[1] = yc[0].u32[1]; - zd[1].u32[1] = yd[0].u32[1]; - za[1].u32[0] = ya[1].u32[1]; - zb[1].u32[0] = yb[1].u32[1]; - zc[1].u32[0] = yc[1].u32[1]; - zd[1].u32[0] = yd[1].u32[1]; - za[1].u32[3] = ya[2].u32[1]; - zb[1].u32[3] = yb[2].u32[1]; - zc[1].u32[3] = yc[2].u32[1]; - zd[1].u32[3] = yd[2].u32[1]; - za[1].u32[2] = ya[3].u32[1]; - zb[1].u32[2] = yb[3].u32[1]; - zc[1].u32[2] = yc[3].u32[1]; - zd[1].u32[2] = yd[3].u32[1]; + XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); + XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); + XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] ); + XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); + XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); + XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] ); + XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); + XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); + XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] ); + XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); + XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] ); + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + SALSA_8ROUNDS_SIMD128_3BUF; - za[2].u32[2] = ya[0].u32[2]; - 
zb[2].u32[2] = yb[0].u32[2]; - zc[2].u32[2] = yc[0].u32[2]; - zd[2].u32[2] = yd[0].u32[2]; - za[2].u32[1] = ya[1].u32[2]; - zb[2].u32[1] = yb[1].u32[2]; - zc[2].u32[1] = yc[1].u32[2]; - zd[2].u32[1] = yd[1].u32[2]; - za[2].u32[0] = ya[2].u32[2]; - zb[2].u32[0] = yb[2].u32[2]; - zc[2].u32[0] = yc[2].u32[2]; - zd[2].u32[0] = yd[2].u32[2]; - za[2].u32[3] = ya[3].u32[2]; - zb[2].u32[3] = yb[3].u32[2]; - zc[2].u32[3] = yc[3].u32[2]; - zd[2].u32[3] = yd[3].u32[2]; +#else - za[3].u32[3] = ya[0].u32[3]; - zb[3].u32[3] = yb[0].u32[3]; - zc[3].u32[3] = yc[0].u32[3]; - zd[3].u32[3] = yd[0].u32[3]; - za[3].u32[2] = ya[1].u32[3]; - zb[3].u32[2] = yb[1].u32[3]; - zc[3].u32[2] = yc[1].u32[3]; - zd[3].u32[2] = yd[1].u32[3]; - za[3].u32[1] = ya[2].u32[3]; - zb[3].u32[1] = yb[2].u32[3]; - zc[3].u32[1] = yc[2].u32[3]; - zd[3].u32[1] = yd[2].u32[3]; - za[3].u32[0] = ya[3].u32[3]; - zb[3].u32[0] = yb[3].u32[3]; - zc[3].u32[0] = yc[3].u32[3]; - zd[3].u32[0] = yd[3].u32[3]; - - BA[0] = _mm_add_epi32( BA[0], za[0].m128 ); - BB[0] = _mm_add_epi32( BB[0], zb[0].m128 ); - BC[0] = _mm_add_epi32( BC[0], zc[0].m128 ); - BD[0] = _mm_add_epi32( BD[0], zd[0].m128 ); - BA[1] = _mm_add_epi32( BA[1], za[1].m128 ); - BB[1] = _mm_add_epi32( BB[1], zb[1].m128 ); - BC[1] = _mm_add_epi32( BC[1], zc[1].m128 ); - BD[1] = _mm_add_epi32( BD[1], zd[1].m128 ); - BA[2] = _mm_add_epi32( BA[2], za[2].m128 ); - BB[2] = _mm_add_epi32( BB[2], zb[2].m128 ); - BC[2] = _mm_add_epi32( BC[2], zc[2].m128 ); - BD[2] = _mm_add_epi32( BD[2], zd[2].m128 ); - BA[3] = _mm_add_epi32( BA[3], za[3].m128 ); - BB[3] = _mm_add_epi32( BB[3], zb[3].m128 ); - BC[3] = _mm_add_epi32( BC[3], zc[3].m128 ); - BD[3] = _mm_add_epi32( BD[3], zd[3].m128 ); + SALSA_8ROUNDS_SIMD128_3BUF_SLOROT; #endif + BA[0] = _mm_add_epi32( BA[0], XA0 ); + BB[0] = _mm_add_epi32( BB[0], XB0 ); + BC[0] = _mm_add_epi32( BC[0], XC0 ); + BA[1] = _mm_add_epi32( BA[1], XA1 ); + BB[1] = _mm_add_epi32( BB[1], XB1 ); + BC[1] = _mm_add_epi32( BC[1], XC1 ); + BA[2] = _mm_add_epi32( BA[2], XA2 ); + BB[2] = _mm_add_epi32( BB[2], XB2 ); + BC[2] = _mm_add_epi32( BC[2], XC2 ); + BA[3] = _mm_add_epi32( BA[3], XA3 ); + BB[3] = _mm_add_epi32( BB[3], XB3 ); + BC[3] = _mm_add_epi32( BC[3], XC3 ); + #undef ROL_1X32 #undef ROR_1X32 #undef SWAP_64 @@ -3796,105 +3077,108 @@ static void salsa8_simd128_4buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #undef TYPE } -void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ) +void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) { - uint32_t *X0 = X; - uint32_t *X1 = X+32; - uint32_t *X2 = X+64; - uint32_t *X3 = X+96; - uint32_t *V0 = V; - uint32_t *V1 = V + 32*N; - uint32_t *V2 = V + 64*N; - uint32_t *V3 = V + 96*N; - - for ( int i = 0; i < N; i++ ) + uint32_t *X0 = X; + uint32_t *X1 = X+32; + uint32_t *X2 = X+64; + uint32_t *V0 = V; + uint32_t *V1 = V + 32*N; + uint32_t *V2 = V + 64*N; + + salsa_simd128_shuffle_3buf( X0, X1, X2 ); + salsa_simd128_shuffle_3buf( X0+16, X1+16, X2+16 ); + + for ( int n = 0; n < N; n++ ) { - for ( int k = 0; k < 8; k++ ) + #if defined(__AVX__) + + for ( int i = 0; i < 4; i++ ) { - _mm_stream_si128( (__m128i*)V0 + i*8 + k, casti_m128i( X0, k ) ); - _mm_stream_si128( (__m128i*)V1 + i*8 + k, casti_m128i( X1, k ) ); - _mm_stream_si128( (__m128i*)V2 + i*8 + k, casti_m128i( X2, k ) ); - _mm_stream_si128( (__m128i*)V3 + i*8 + k, casti_m128i( X3, k ) ); + _mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) ); + _mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) ); + 
_mm256_stream_si256( (__m256i*)V2 + n*4 + i, casti_m256i( X2, i ) ); } - salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); - } - for ( int i = 0; i < N; i++ ) - { - #if defined(__AVX2__) - - const int j0 = 4 * ( X0[16] & ( N - 1 ) ); - const int j1 = 4 * ( X1[16] & ( N - 1 ) ); - const int j2 = 4 * ( X2[16] & ( N - 1 ) ); - const int j3 = 4 * ( X3[16] & ( N - 1 ) ); + #elif defined(__SSE4_1__) - for ( int k = 0; k < 4; k++ ) + for ( int i = 0; i < 8; i++ ) { - const __m256i v0 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+k ); - const __m256i v1 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+k ); - const __m256i v2 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+k ); - const __m256i v3 = _mm256_stream_load_si256( ( (__m256i*)V3 ) +j3+k ); - casti_m256i( X0, k ) = _mm256_xor_si256( casti_m256i( X0, k ), v0 ); - casti_m256i( X1, k ) = _mm256_xor_si256( casti_m256i( X1, k ), v1 ); - casti_m256i( X2, k ) = _mm256_xor_si256( casti_m256i( X2, k ), v2 ); - casti_m256i( X3, k ) = _mm256_xor_si256( casti_m256i( X3, k ), v3 ); + _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); + _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + _mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) ); } #else - - const int j0 = 8 * ( X0[16] & ( N - 1 ) ); - const int j1 = 8 * ( X1[16] & ( N - 1 ) ); - const int j2 = 8 * ( X2[16] & ( N - 1 ) ); - const int j3 = 8 * ( X3[16] & ( N - 1 ) ); - for ( int k = 0; k < 8; k++ ) - { - #if defined(__SSE4_1__) - const __m128i v0 = _mm_stream_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_stream_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_stream_load_si128( ( (__m128i*)V2 ) +j2+k ); - const __m128i v3 = _mm_stream_load_si128( ( (__m128i*)V3 ) +j3+k ); - #else - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+k ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+k ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+k ); - const __m128i v3 = _mm_load_si128( ( (__m128i*)V3 ) +j3+k ); - #endif - casti_m128i( X0, k ) = _mm_xor_si128( casti_m128i( X0, k ), v0 ); - casti_m128i( X1, k ) = _mm_xor_si128( casti_m128i( X1, k ), v1 ); - casti_m128i( X2, k ) = _mm_xor_si128( casti_m128i( X2, k ), v2 ); - casti_m128i( X3, k ) = _mm_xor_si128( casti_m128i( X3, k ), v3 ); - } + memcpy( &V0[ n*32 ], X0, 128 ); + memcpy( &V1[ n*32 ], X1, 128 ); + memcpy( &V2[ n*32 ], X2, 128 ); - #endif + #endif -/* - const int j0 = 16 * ( X0[16] & ( N - 1 ) ); - const int j1 = 16 * ( X1[16] & ( N - 1 ) ); - const int j2 = 16 * ( X2[16] & ( N - 1 ) ); - const int j3 = 16 * ( X3[16] & ( N - 1 ) ); + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); + } - for ( int k = 0; k < 16; k++ ) + for ( int n = 0; n < N; n++ ) + { + #if defined(__AVX2__) + + const int j0 = 4 * ( X0[16] & ( N-1 ) ); + const int j1 = 4 * ( X1[16] & ( N-1 ) ); + const int j2 = 4 * ( X2[16] & ( N-1 ) ); + + const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 ); + const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 ); + const __m256i v20 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2 ); + const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 ); + const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 ); + const __m256i v21 = _mm256_stream_load_si256( ( (__m256i*)V2 ) 
+j2+1 ); + const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 ); + const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 ); + const __m256i v22 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+2 ); + const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 ); + const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 ); + const __m256i v23 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+3 ); + + casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 ); + casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 ); + casti_m256i( X2, 0 ) = _mm256_xor_si256( casti_m256i( X2, 0 ), v20 ); + casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 ); + casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 ); + casti_m256i( X2, 1 ) = _mm256_xor_si256( casti_m256i( X2, 1 ), v21 ); + casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 ); + casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 ); + casti_m256i( X2, 2 ) = _mm256_xor_si256( casti_m256i( X2, 2 ), v22 ); + casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 ); + casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 ); + casti_m256i( X2, 3 ) = _mm256_xor_si256( casti_m256i( X2, 3 ), v23 ); + + #else + + const int j0 = 8 * ( X0[16] & ( N-1 ) ); + const int j1 = 8 * ( X1[16] & ( N-1 ) ); + const int j2 = 8 * ( X2[16] & ( N-1 ) ); + for ( int i = 0; i < 8; i++ ) { - const uint64_t v0 = ( (uint64_t*)V0 )[ j0+k ]; - const uint64_t v1 = ( (uint64_t*)V1 )[ j1+k ]; - const uint64_t v2 = ( (uint64_t*)V2 )[ j2+k ]; - const uint64_t v3 = ( (uint64_t*)V3 )[ j3+k ]; - ( (uint64_t*)X0 )[k] ^= v0; - ( (uint64_t*)X1 )[k] ^= v1; - ( (uint64_t*)X2 )[k] ^= v2; - ( (uint64_t*)X3 )[k] ^= v3; + const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); + const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); + const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i ); + casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); + casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 ); } -*/ - salsa8_simd128_4buf( &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0], - &X0[16], &X1[16], &X2[16], &X3[16] ); - salsa8_simd128_4buf( &X0[16], &X1[16], &X2[16], &X3[16], - &X0[ 0], &X1[ 0], &X2[ 0], &X3[ 0] ); + #endif + + salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 ); + salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 ); } + + salsa_simd128_unshuffle_3buf( X0, X1, X2 ); + salsa_simd128_unshuffle_3buf( X0+16, X1+16, X2+16 ); + } @@ -3961,17 +3245,17 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C) void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ) { - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { - memcpy( &V[i * 32], X, 128 ); + memcpy( &V[ n*32 ], X, 128 ); xor_salsa8( &X[ 0], &X[16] ); xor_salsa8( &X[16], &X[ 0] ); } - for ( int i = 0; i < N; i++ ) + for ( int n = 0; n < N; n++ ) { int j = 32 * ( X[16] & ( N - 1 ) ); - for ( int k = 0; k < 32; k++ ) - X[k] ^= V[j + k]; + for ( int i = 0; i < 32; i++ ) + X[i] ^= V[ j+i ]; xor_salsa8( &X[ 0], &X[16] ); xor_salsa8( &X[16], &X[ 0] ); } diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index a15b5cb1..e919ccb3 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -146,6 +146,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, output[i] = bswap_32( ostate[i] ); } +#if 
defined(__SHA__) + +static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, + const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1, + uint32_t *ostate0, uint32_t *ostate1 ) +{ + uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16]; + int i; + + memcpy( pad0, key0 + 16, 16 ); + memcpy( pad0 + 4, keypad, 48 ); + memcpy( pad1, key1 + 16, 16 ); + memcpy( pad1 + 4, keypad, 48 ); + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + tstate0, tstate1 ); + + memcpy( ihash0, tstate0, 32 ); + memcpy( ihash1, tstate1, 32 ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x5c5c5c5c; + pad1[i] = ihash1[i] ^ 0x5c5c5c5c; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c; + + sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); + + for ( i = 0; i < 8; i++ ) + { + pad0[i] = ihash0[i] ^ 0x36363636; + pad1[i] = ihash1[i] ^ 0x36363636; + } + for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636; + + sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_initial_state, sha256_initial_state ); +} + +static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, + const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0, + uint32_t *output1 ) +{ + uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8]; + uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16]; + int i, j; + + sha256_ni2way_transform_le( istate0, istate1, salt0, salt1, + tstate0, tstate1 ); + + memcpy( ibuf0, salt0 + 16, 16 ); + memcpy( ibuf0 + 5, innerpad, 44 ); + memcpy( obuf0 + 8, outerpad, 32 ); + memcpy( ibuf1, salt1 + 16, 16 ); + memcpy( ibuf1 + 5, innerpad, 44 ); + memcpy( obuf1 + 8, outerpad, 32 ); + + for ( i = 0; i < 4; i++ ) + { + memcpy( obuf0, istate0, 32 ); + memcpy( obuf1, istate1, 32 ); + ibuf0[4] = ibuf1[4] = i + 1; + + sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1, + obuf0, obuf1 ); + sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1, + ostate0, ostate1 ); + + for ( j = 0; j < 8; j++ ) + { + output0[ 8*i + j ] = bswap_32( ostateb0[j] ); + output1[ 8*i + j ] = bswap_32( ostateb1[j] ); + } + } +} + +static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, + uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1, + const uint32_t *salt0, const uint32_t *salt1, + uint32_t *output0, uint32_t *output1 ) +{ + uint32_t buf0[16], buf1[16]; + int i; + + sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1, + tstate0, tstate1 ); + sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16, + tstate0, tstate1 ); + sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk, + tstate0, tstate1 ); + + memcpy( buf0, tstate0, 32 ); + memcpy( buf0 + 8, outerpad, 32 ); + memcpy( buf1, tstate1, 32 ); + memcpy( buf1 + 8, outerpad, 32 ); + + sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1, + ostate0, ostate1 ); + + for ( i = 0; i < 8; i++ ) + { + output0[i] = bswap_32( ostate0[i] ); + output1[i] = bswap_32( ostate1[i] ); + } +} + + + +#endif + #ifdef HAVE_SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { @@ -643,10 +756,10 @@ static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[8 * 8]; - uint32_t _ALIGN(128) ostate[8 * 8]; - uint32_t _ALIGN(128) W[8 * 32]; - uint32_t _ALIGN(128) X[8 * 32]; + uint32_t 
_ALIGN(128) tstate[ 8*8 ]; + uint32_t _ALIGN(128) ostate[ 8*8 ]; + uint32_t _ALIGN(128) W[ 8*32 ]; + uint32_t _ALIGN(128) X[ 8*32 ]; uint32_t *V = (uint32_t*)scratchpad; intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60, @@ -658,53 +771,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W ); dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 ); + + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + } + else + { + intrlv_2x128( W, X, X+ 32, 1024 ); + intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); + intrlv_2x128( W+128, X+128, X+160, 1024 ); + intrlv_2x128( W+192, X+192, X+224, 1024 ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + dintrlv_2x128( X, X+ 32, W, 1024 ); + dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); + dintrlv_2x128( X+128, X+160, W+128, 1024 ); + dintrlv_2x128( X+192, X+224, W+192, 1024 ); + } + // SCRYPT CORE - - // AVX512 - -/* - // AVX512 16 way working - intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, - X+256, X+256+32, X+256+64, X+256+96, X+256+128, - X+256+160, X+256+192, X+256+224, 1024 ); - - scrypt_core_16way( (__m512i*)W , (__m512i*)V, N ); - - dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, - X+256, X+256+32, X+256+64, X+256+96, X+256+128, - X+256+160, X+256+192, X+256+224, W, 1024 ); -*/ -/* - // AVX512 working - intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); - intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); - scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); - dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); - dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); -*/ -/* - // AVX512, not working, very slow - intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); - intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); - scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); - dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); - dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); -*/ - // AVX2 -/* + // AVX2 // disable de/interleave for testing. 
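// Editor's note (hedged scalar sketch, not part of the patch): the
// multi-buffer path selected above for large N (scrypt_core_simd128_2buf/
// _3buf) advances independent lanes together so the data-dependent
// scratchpad loads of one lane overlap the salsa arithmetic of the others.
// xor_salsa8() is assumed to be the scalar reference from
// scrypt-core-4way.c; X0/X1 are 32 uint32_t each, V holds both lane regions.
static void scrypt_core_2buf_sketch( uint32_t *X0, uint32_t *X1,
                                     uint32_t *V, uint32_t N )
{
   uint32_t *V0 = V;           // each lane owns its own N*32 word region,
   uint32_t *V1 = V + 32*N;    // as in scrypt_core_simd128_3buf
   for ( uint32_t n = 0; n < N; n++ )
   {
      memcpy( &V0[ n*32 ], X0, 128 );     // save current lane states
      memcpy( &V1[ n*32 ], X1, 128 );
      xor_salsa8( &X0[ 0], &X0[16] );   xor_salsa8( &X0[16], &X0[ 0] );
      xor_salsa8( &X1[ 0], &X1[16] );   xor_salsa8( &X1[16], &X1[ 0] );
   }
   for ( uint32_t n = 0; n < N; n++ )
   {
      const uint32_t j0 = 32 * ( X0[16] & ( N-1 ) );   // independent,
      const uint32_t j1 = 32 * ( X1[16] & ( N-1 ) );   // data-dependent
      for ( int i = 0; i < 32; i++ )
      {
         X0[i] ^= V0[ j0+i ];
         X1[i] ^= V1[ j1+i ];
      }
      xor_salsa8( &X0[ 0], &X0[16] );   xor_salsa8( &X0[16], &X0[ 0] );
      xor_salsa8( &X1[ 0], &X1[16] );   xor_salsa8( &X1[16], &X1[ 0] );
   }
}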
- scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); -*/ +// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N ); + /* // AVX2 working @@ -714,23 +819,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+192, X+192, X+224, 1024 ); // working -// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N ); +// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); // if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); // working - scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); - - // working -// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); @@ -745,18 +845,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, intrlv_2x32( W+128, X+128, X+160, 1024 ); intrlv_2x32( W+192, X+192, X+224, 1024 ); - // working, deprecated, not up to data -// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); - - // deprecated, not up to date -// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - // working -// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); + scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); // scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; @@ -813,19 +905,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+192, V, N ); */ - +/************** scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_3buf( X+ 96, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+192, V, N ); - -/* - // SSE2 working - scrypt_core_simd128_4buf( X, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+128, V, N ); -*/ +*************/ if ( work_restart[thrid].restart ) return 0; @@ -868,6 +954,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, W, 1024 ); + if ( opt_param_n > 0x4000 ) + { + scrypt_core_simd128_3buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( 
X+ 96, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+192, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+256, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_3buf( X+352, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+448, V, N ); + } + else + { + intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); + intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 ); + intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 ); + scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N ); + dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); + dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 ); + dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 ); + } + // SCRYPT CORE @@ -888,23 +1007,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // AVX512 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); */ /* - // AVX512, not working, very slow + // AVX512, working intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 ); + intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); + intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N ); dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 ); + dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); + dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 ); */ + // AVX2 /* @@ -919,16 +1055,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 ); intrlv_2x128( W+128, X+128, X+160, 1024 ); intrlv_2x128( W+192, X+192, X+224, 1024 ); - - // working -// scrypt_core_2way_simd128_3buf( 
(__m256i*) W, (__m256i*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); + intrlv_2x128( W+256, X+256, X+256+ 32, 1024 ); + intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 ); + intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 ); + intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 ); // working scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N ); // working // scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N ); @@ -938,11 +1077,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N ); // if ( work_restart[thrid].restart ) return 0; // scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N ); dintrlv_2x128( X, X+ 32, W, 1024 ); dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 ); dintrlv_2x128( X+128, X+160, W+128, 1024 ); dintrlv_2x128( X+192, X+224, W+192, 1024 ); + dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 ); + dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 ); + dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 ); + dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 ); */ /* @@ -952,18 +1103,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_2x32( W+128, X+128, X+160, 1024 ); intrlv_2x32( W+192, X+192, X+224, 1024 ); - // working, deprecated, not up to data -// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N ); - - // deprecated, not up to date -// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N ); -// if ( work_restart[thrid].restart ) return 0; -// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N ); - // working // scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; // scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); +// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N ); +// if ( work_restart[thrid].restart ) return 0; +// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N ); // scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N ); // if ( work_restart[thrid].restart ) return 0; @@ -1043,7 +1189,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+448, V, N ); */ - +/*************** scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_3buf( X+ 96, V, N ); @@ -1055,17 +1201,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, scrypt_core_simd128_3buf( X+352, V, N ); if ( work_restart[thrid].restart ) return 
0; scrypt_core_simd128_2buf( X+448, V, N ); - -/* - // SSE2 working - scrypt_core_simd128_4buf( X, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+128, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+256, V, N ); - if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4buf( X+384, V, N ); -*/ +********************/ /* scrypt_core_3way( X, V, N ); if ( work_restart[thrid].restart ) return 0; @@ -1102,6 +1238,31 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, #if defined(__SHA__) +static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) +{ + uint32_t _ALIGN(128) tstate[ 2*8 ]; + uint32_t _ALIGN(128) ostate[ 2*8 ]; + uint32_t _ALIGN(128) W[ 2*32 ]; + uint32_t *V = (uint32_t*)scratchpad; + + memcpy( tstate, midstate, 32 ); + memcpy( tstate+ 8, midstate, 32 ); + + HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8, + ostate, ostate+8 ); + PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, + input, input+20, W, W+32 ); + + scrypt_core_simd128_2buf( W, V, N ); + if ( work_restart[thrid].restart ) return 0; + + PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32, + output, output+8 ); + + return 1; +} + static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { @@ -1149,8 +1310,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, scrypt_core_simd128( W+96, V, N ); */ - // working -// scrypt_core_simd128_4buf( W, V, N ); if ( work_restart[thrid].restart ) return 0; @@ -1171,10 +1330,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output, static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N, int thrid ) { - uint32_t _ALIGN(128) tstate[4 * 8]; - uint32_t _ALIGN(128) ostate[4 * 8]; - uint32_t _ALIGN(128) W[4 * 32]; - uint32_t _ALIGN(128) X[4 * 32]; + uint32_t _ALIGN(128) tstate[ 4*8 ]; + uint32_t _ALIGN(128) ostate[ 4*8 ]; + uint32_t _ALIGN(128) W[ 4*32 ]; uint32_t *V = (uint32_t*)scratchpad; intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); @@ -1184,7 +1342,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, HMAC_SHA256_80_init_4way(W, tstate, ostate); PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + if ( opt_param_n > 0x4000 ) + { + uint32_t _ALIGN(128) X[ 4*32 ]; + dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); + scrypt_core_simd128_2buf( X, V, N ); + if ( work_restart[thrid].restart ) return 0; + scrypt_core_simd128_2buf( X+64, V, N ); + intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); + } + else + scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); + + + +// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 ); ////// SCRYPT_CORE @@ -1202,35 +1374,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128( X+96, V, N ); */ - +/* // working, double buffered linear simd, best for n2 scrypt_core_simd128_2buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128_2buf( X+64, V, N ); - +*/ /* scrypt_core_simd128_3buf( X, V, N ); if ( work_restart[thrid].restart ) return 0; scrypt_core_simd128( X+96, V, N ); */ - // working -// 
scrypt_core_simd128_4buf( X, V, N ); - - -/* - // original - scrypt_core(X + 0 * 32, V, N); - scrypt_core(X + 1 * 32, V, N); - scrypt_core(X + 2 * 32, V, N); - scrypt_core(X + 3 * 32, V, N); -*/ - //////////////////////////////// if ( work_restart[thrid].restart ) return 0; - intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); +// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); @@ -1247,22 +1407,22 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; + uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; int thr_id = mythr->id; int throughput = scrypt_throughput; - int i; + int i; volatile uint8_t *restart = &(work_restart[thr_id].restart); - for ( i = 0; i < throughput; i++ ) - memcpy( data + i * 20, pdata, 80 ); + for ( i = 0; i < throughput; i++ ) + memcpy( data + i * 20, pdata, 80 ); sha256_transform_le( midstate, data, sha256_initial_state ); - do { + do { bool rc = true; - for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; + for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n; #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) if ( throughput == 16 ) @@ -1276,7 +1436,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, opt_param_n, thr_id ); else #endif - if ( throughput == 4 ) + if ( throughput == 4 ) // slower on Ryzen than 8way #if defined(__SHA__) rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); @@ -1284,10 +1444,17 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); #endif +#if defined(__SHA__) else + if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way + rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf, + opt_param_n, thr_id ); +#endif + else // should never get here rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf, opt_param_n, thr_id ); + // test the hash if ( rc ) for ( i = 0; i < throughput; i++ ) { @@ -1319,11 +1486,11 @@ bool scrypt_miner_thread_init( int thr_id ) bool register_scrypt_algo( algo_gate_t* gate ) { -#if defined(__SHA__) - gate->optimizations = SSE2_OPT | SHA_OPT; -#else +//#if defined(__SHA__) +// gate->optimizations = SSE2_OPT | SHA_OPT; +//#else gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; -#endif +//#endif gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->scanhash = (void*)&scanhash_scrypt; opt_target_factor = 65536.0; @@ -1332,16 +1499,29 @@ bool register_scrypt_algo( algo_gate_t* gate ) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) scrypt_throughput = 16; - scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way + +/* SHA is slower than AVX2 on Ryzen #elif defined(__SHA__) scrypt_throughput = 4; scratchbuf_size = opt_param_n * 2 * 128; // 2 buf +*/ + #elif defined(__AVX2__) scrypt_throughput = 8; - scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + if ( opt_param_n > 0x4000 ) + scratchbuf_size = opt_param_n * 3 * 128; // 3 buf + else + scratchbuf_size = opt_param_n * 2 * 128; // 2 way 
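/* Editor's note (illustration only, not part of the patch): each scrypt
   lane needs N * 128 bytes of scratchpad, so scratchbuf_size is
   N * 128 * (concurrent lanes per thread). With the 0x4000 (16384)
   threshold above, scryptn2 at e.g. N = 1048576 on AVX2 allocates
   1048576 * 128 * 3 = 384 MiB per thread (three linear buffers), while
   scrypt:1024 allocates 1024 * 128 * 2 = 256 KiB per thread (one 2-way
   interleaved buffer). */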
#else scrypt_throughput = 4; + if ( opt_param_n > 0x4000 ) scratchbuf_size = opt_param_n * 2 * 128; // 2 buf + else + scratchbuf_size = opt_param_n * 4 * 128; // 4 way #endif char t_units[4] = {0}; diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 7b6618c4..de3f1d43 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -51,7 +51,6 @@ typedef struct { __m128i buf[64>>2]; __m128i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_4way_context __attribute__ ((aligned (64))); void sha256_4way_init( sha256_4way_context *sc ); @@ -74,7 +73,6 @@ typedef struct { __m256i buf[64>>2]; __m256i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_8way_context __attribute__ ((aligned (128))); void sha256_8way_init( sha256_8way_context *sc ); @@ -96,7 +94,6 @@ typedef struct { __m512i buf[64>>2]; __m512i val[8]; uint32_t count_high, count_low; - bool initialized; } sha256_16way_context __attribute__ ((aligned (128))); void sha256_16way_init( sha256_16way_context *sc ); diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index beac702c..1c630cc8 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -107,22 +107,19 @@ do { \ } while (0) // LE data, no need to byte swap -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, + const __m128i *in ) { __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; - __m128i W[16]; - - memcpy_128( W, data, 16 ); - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = in[0]; + B = in[1]; + C = in[2]; + D = in[3]; + E = in[4]; + F = in[5]; + G = in[6]; + H = in[7]; Y_xor_Z = _mm_xor_si128( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); @@ -179,228 +176,46 @@ void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + out[0] = _mm_add_epi32( in[0], A ); + out[1] = _mm_add_epi32( in[1], B ); + out[2] = _mm_add_epi32( in[2], C ); + out[3] = _mm_add_epi32( in[3], D ); + out[4] = _mm_add_epi32( in[4], E ); + out[5] = _mm_add_epi32( in[5], F ); + out[6] = _mm_add_epi32( in[6], G ); + out[7] = _mm_add_epi32( in[7], H ); } -// BE data, need to byte swap -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +// LE data, no need to byte swap +void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) { - __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; - - mm128_block_bswap_32( W, data ); - mm128_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, 
A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + memcpy_128( W, data, 16 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } - -static void -sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) +// BE data, need to byte swap input data +void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) { - register __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; __m128i W[16]; - - mm128_block_bswap_32( W, in ); - mm128_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m128_const1_64( 0x6A09E6676A09E667 ); - B = m128_const1_64( 0xBB67AE85BB67AE85 ); - C = m128_const1_64( 0x3C6EF3723C6EF372 ); - D = m128_const1_64( 0xA54FF53AA54FF53A ); - E = m128_const1_64( 0x510E527F510E527F ); - F = m128_const1_64( 0x9B05688C9B05688C ); - G = m128_const1_64( 0x1F83D9AB1F83D9AB ); - H = m128_const1_64( 0x5BE0CD195BE0CD19 ); - } - - Y_xor_Z = _mm_xor_si128( B, C ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, 
G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm_add_epi32( r[0], A ); - r[1] = _mm_add_epi32( r[1], B ); - r[2] = _mm_add_epi32( r[2], C ); - r[3] = _mm_add_epi32( r[3], D ); - r[4] = _mm_add_epi32( r[4], E ); - r[5] = _mm_add_epi32( r[5], F ); - r[6] = _mm_add_epi32( r[6], G ); - r[7] = _mm_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + mm128_block_bswap_32( W, data ); + mm128_block_bswap_32( W+8, data+8 ); + SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } void sha256_4way_init( sha256_4way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm_set1_epi32( H256[0] ); - sc->val[1] = _mm_set1_epi32( H256[1] ); - sc->val[2] = _mm_set1_epi32( H256[2] ); - sc->val[3] = _mm_set1_epi32( 
H256[3] ); - sc->val[4] = _mm_set1_epi32( H256[4] ); - sc->val[5] = _mm_set1_epi32( H256[5] ); - sc->val[6] = _mm_set1_epi32( H256[6] ); - sc->val[7] = _mm_set1_epi32( H256[7] ); -*/ + sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m128_const1_64( 0x510E527F510E527F ); + sc->val[5] = m128_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); } void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) @@ -424,7 +239,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -449,7 +264,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_128( sc->buf, pad >> 2 ); } else @@ -461,7 +276,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) ); sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) ); - sha256_4way_round( sc, sc->buf, sc->val ); + sha256_4way_transform_be( sc->val, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); } @@ -539,8 +354,7 @@ do { \ #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ __m256i T1 = BSG2_1x( E ); \ __m256i T2 = BSG2_0x( A ); \ T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ @@ -552,45 +366,74 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) -/* -#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ +// the X_xor_y technique can be extended to eliminate the mov instruction. +// Perform double rounds and alternate each round. Doesn't apply to AVX512 +// and isn't suitable for running 3 round prehash. +// +// read Y_xor_Z, update X_xor_Y +#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ + _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ + Y_xor_Z ) ) + +// start with toc initialized to y^z: toc = B ^ C +// First round reads toc as Y_xor_Z and saves X_xor_Y as tic. +// Second round reads tic as Y_xor_Z and saves X_xor_Y as toc. 
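// Editor's note (scalar illustration, not part of the patch): the identity
// behind MAJ_2step is Maj(x,y,z) = y ^ ((x ^ y) & (y ^ z)). Because the
// SHA-256 working variables rotate every round, this round's y^z equals the
// previous round's x^y, so keeping two running xors (tic, toc) and
// alternating their roles removes the register move a single cached
// Y_xor_Z variable would need.
static inline uint32_t maj_cached( uint32_t x, uint32_t y, uint32_t z,
                                   uint32_t *save_x_xor_y, // written: x ^ y
                                   uint32_t prev_x_xor_y ) // read as  y ^ z
{
   *save_x_xor_y = x ^ y;
   return y ^ ( *save_x_xor_y & prev_x_xor_y );
}
// Two consecutive rounds, seeded with toc = B ^ C as in the macro below:
//    uint32_t tic, toc = b ^ c;
//    m0 = maj_cached( a, b, c, &tic, toc );  // round i:   Maj(A,B,C)
//    m1 = maj_cached( h, a, b, &toc, tic );  // round i+1: Maj(H,A,B)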
+ +#define SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, i0, i1, j ) \ do { \ - __m256i T1, T2; \ - __m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \ - T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - Y_xor_Z = X_xor_Y; \ + __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \ + W[ i0 ] ); \ + __m256i T1 = BSG2_1x( E ); \ + __m256i T2 = BSG2_0x( A ); \ + T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ + T1 = _mm256_add_epi32( T1, H ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ +\ + T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \ + W[ (i1) ] ); \ + T1 = BSG2_1x( D ); \ + T2 = BSG2_0x( H ); \ + T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \ + T1 = _mm256_add_epi32( T1, G ); \ + T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \ + T1 = _mm256_add_epi32( T1, T0 ); \ + C = _mm256_add_epi32( C, T1 ); \ + G = _mm256_add_epi32( T1, T2 ); \ } while (0) -*/ #endif // AVX512VL else AVX2 -// accepts LE byte ordered data, skip the byte swap -void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, - const __m256i *state_in ) +static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, + const __m256i *in ) \ { __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif - __m256i W[16]; - memcpy_256( W, data, 16 ); - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = _mm256_load_si256( in ); + B = _mm256_load_si256( in+1 ); + C = _mm256_load_si256( in+2 ); + D = _mm256_load_si256( in+3 ); + E = _mm256_load_si256( in+4 ); + F = _mm256_load_si256( in+5 ); + G = _mm256_load_si256( in+6 ); + H = _mm256_load_si256( in+7 ); #if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif + + __m256i tic, toc = _mm256_xor_si256( B, C ); + + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, 0 ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, 0 ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, 0 ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, 0 ); + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, 0 ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, 0 ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, 0 ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, 0 ); + +#else SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -609,6 +452,8 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); +#endif + for ( int j = 16; j < 64; j += 16 ) { W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); @@ -628,6 +473,19 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); +#if !defined(__AVX512VL__) + + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); + SHA256_8WAY_2STEP( C, D, E, F, G, H, 
A, B, 14, 15, j ); + +#else + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); @@ -644,244 +502,52 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + +#endif } - state_out[0] = _mm256_add_epi32( state_in[0], A ); - state_out[1] = _mm256_add_epi32( state_in[1], B ); - state_out[2] = _mm256_add_epi32( state_in[2], C ); - state_out[3] = _mm256_add_epi32( state_in[3], D ); - state_out[4] = _mm256_add_epi32( state_in[4], E ); - state_out[5] = _mm256_add_epi32( state_in[5], F ); - state_out[6] = _mm256_add_epi32( state_in[6], G ); - state_out[7] = _mm256_add_epi32( state_in[7], H ); + out[0] = _mm256_add_epi32( in[0], A ); + out[1] = _mm256_add_epi32( in[1], B ); + out[2] = _mm256_add_epi32( in[2], C ); + out[3] = _mm256_add_epi32( in[3], D ); + out[4] = _mm256_add_epi32( in[4], E ); + out[5] = _mm256_add_epi32( in[5], F ); + out[6] = _mm256_add_epi32( in[6], G ); + out[7] = _mm256_add_epi32( in[7], H ); } - -// Accepts BE byte ordered data, need to byte swap -void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, +// accepts LE input data +void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { - __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif __m256i W[16]; - - mm256_block_bswap_32( W , data ); - mm256_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - -#if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( 
E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm256_add_epi32( state_in[0], A ); - state_out[1] = _mm256_add_epi32( state_in[1], B ); - state_out[2] = _mm256_add_epi32( state_in[2], C ); - state_out[3] = _mm256_add_epi32( state_in[3], D ); - state_out[4] = _mm256_add_epi32( state_in[4], E ); - state_out[5] = _mm256_add_epi32( state_in[5], F ); - state_out[6] = _mm256_add_epi32( state_in[6], G ); - state_out[7] = _mm256_add_epi32( state_in[7], H ); + memcpy_256( W, data, 16 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } -static void -sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) +// Accepts BE input data, need to bswap +void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) { - register __m256i A, B, C, D, E, F, G, H; -#if !defined(__AVX512VL__) - __m256i X_xor_Y, Y_xor_Z; -#endif __m256i W[16]; - - mm256_block_bswap_32( W , in ); - mm256_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m256_const1_64( 0x6A09E6676A09E667 ); - B = m256_const1_64( 0xBB67AE85BB67AE85 ); - C = m256_const1_64( 0x3C6EF3723C6EF372 ); - D = m256_const1_64( 0xA54FF53AA54FF53A ); - E = m256_const1_64( 0x510E527F510E527F ); - F = m256_const1_64( 0x9B05688C9B05688C ); - G = m256_const1_64( 0x1F83D9AB1F83D9AB ); - H = m256_const1_64( 0x5BE0CD195BE0CD19 ); - } - -#if !defined(__AVX512VL__) - Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = 
SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm256_add_epi32( r[0], A ); - r[1] = _mm256_add_epi32( r[1], B ); - r[2] = _mm256_add_epi32( r[2], C ); - r[3] = _mm256_add_epi32( r[3], D ); - r[4] = _mm256_add_epi32( r[4], E ); - r[5] = _mm256_add_epi32( r[5], F ); - r[6] = _mm256_add_epi32( r[6], G ); - r[7] = _mm256_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) ); - r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) ); - } + mm256_block_bswap_32( W , data ); + mm256_block_bswap_32( W+8, data+8 ); + SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } void sha256_8way_init( sha256_8way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; -/* - sc->val[0] = _mm256_set1_epi32( H256[0] ); - sc->val[1] = _mm256_set1_epi32( H256[1] ); - sc->val[2] = _mm256_set1_epi32( H256[2] ); - sc->val[3] = _mm256_set1_epi32( H256[3] ); - sc->val[4] = _mm256_set1_epi32( H256[4] ); - sc->val[5] = _mm256_set1_epi32( H256[5] ); - sc->val[6] = _mm256_set1_epi32( H256[6] ); - sc->val[7] = _mm256_set1_epi32( H256[7] ); -*/ + sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m256_const1_64( 0x510E527F510E527F ); + sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); } - // need to handle odd byte length for yespower. // Assume only last update is odd. 
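// Editor's note (usage sketch, not part of the patch): with the lazy
// "initialized" flag removed, sha256_8way_init() seeds val[] with the
// SHA-256 IV directly, so update()/close() below can push every block
// through sha256_8way_transform_be() with no first-block special case.
// The input layout (8 lanes interleaved as 32-bit words, 80 bytes per lane
// here) is an assumption for the example.
static void sha256_8way_80byte_example( void *out_8x32, const void *in_8x32 )
{
   sha256_8way_context ctx;
   sha256_8way_init( &ctx );
   sha256_8way_update( &ctx, in_8x32, 80 );
   sha256_8way_close( &ctx, out_8x32 );    // 8 interleaved 32-byte digests
}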
@@ -906,7 +572,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -931,7 +597,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_256( sc->buf, pad >> 2 ); } else @@ -944,7 +610,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) ); - sha256_8way_round( sc, sc->buf, sc->val ); + sha256_8way_transform_be( sc->val, sc->buf, sc->val ); mm256_block_bswap_32( dst, sc->val ); } @@ -986,8 +652,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) #define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ - __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ + __m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \ __m512i T1 = BSG2_1x16( E ); \ __m512i T2 = BSG2_0x16( A ); \ T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ @@ -1011,23 +676,19 @@ do { \ } while (0) */ -// accepts LE input data -void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, - const __m512i *state_in ) + +static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, + const __m512i *in ) \ { __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; - - memcpy_512( W, data, 16 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; + A = _mm512_load_si512( in ); + B = _mm512_load_si512( in+1 ); + C = _mm512_load_si512( in+2 ); + D = _mm512_load_si512( in+3 ); + E = _mm512_load_si512( in+4 ); + F = _mm512_load_si512( in+5 ); + G = _mm512_load_si512( in+6 ); + H = _mm512_load_si512( in+7 ); SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -1083,100 +744,36 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - state_out[0] = _mm512_add_epi32( state_in[0], A ); - state_out[1] = _mm512_add_epi32( state_in[1], B ); - state_out[2] = _mm512_add_epi32( state_in[2], C ); - state_out[3] = _mm512_add_epi32( state_in[3], D ); - state_out[4] = _mm512_add_epi32( state_in[4], E ); - state_out[5] = _mm512_add_epi32( state_in[5], F ); - state_out[6] = _mm512_add_epi32( state_in[6], G ); - state_out[7] = _mm512_add_epi32( state_in[7], H ); + out[0] = _mm512_add_epi32( in[0], A ); + out[1] = _mm512_add_epi32( in[1], B ); + out[2] = _mm512_add_epi32( in[2], C ); + out[3] = _mm512_add_epi32( in[3], D ); + out[4] = _mm512_add_epi32( in[4], E ); + out[5] = _mm512_add_epi32( in[5], F ); + out[6] = _mm512_add_epi32( in[6], G ); + out[7] = _mm512_add_epi32( in[7], H ); +} + +// accepts LE input data +void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i W[16]; + memcpy_512( W, data, 16 ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } // Accepts BE input data, need to bswap void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { - __m512i A, B, C, D, E, F, G, H; __m512i W[16]; - 
mm512_block_bswap_32( W , data ); mm512_block_bswap_32( W+8, data+8 ); - - A = state_in[0]; - B = state_in[1]; - C = state_in[2]; - D = state_in[3]; - E = state_in[4]; - F = state_in[5]; - G = state_in[6]; - H = state_in[7]; - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - state_out[0] = _mm512_add_epi32( state_in[0], A ); - state_out[1] = _mm512_add_epi32( state_in[1], B ); - state_out[2] = _mm512_add_epi32( state_in[2], C ); - state_out[3] = _mm512_add_epi32( state_in[3], D ); - state_out[4] = _mm512_add_epi32( state_in[4], E ); - state_out[5] = _mm512_add_epi32( state_in[5], F ); - state_out[6] = _mm512_add_epi32( state_in[6], G ); - state_out[7] = _mm512_add_epi32( state_in[7], H ); + SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } - -// Aggresive prehashing + +// Aggresive prehashing, LE byte order void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, const __m512i *state_in ) { @@ -1295,125 +892,19 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, _mm512_store_si512( state_out + 7, H ); } -static void -sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) -{ - register __m512i A, B, C, D, E, F, G, H; - __m512i W[16]; - - mm512_block_bswap_32( W 
, in ); - mm512_block_bswap_32( W+8, in+8 ); - - if ( ctx->initialized ) - { - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - } - else - { - A = m512_const1_64( 0x6A09E6676A09E667 ); - B = m512_const1_64( 0xBB67AE85BB67AE85 ); - C = m512_const1_64( 0x3C6EF3723C6EF372 ); - D = m512_const1_64( 0xA54FF53AA54FF53A ); - E = m512_const1_64( 0x510E527F510E527F ); - F = m512_const1_64( 0x9B05688C9B05688C ); - G = m512_const1_64( 0x1F83D9AB1F83D9AB ); - H = m512_const1_64( 0x5BE0CD195BE0CD19 ); - } - - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } - - if ( ctx->initialized ) - { - r[0] = _mm512_add_epi32( r[0], A ); - r[1] = _mm512_add_epi32( r[1], B ); - r[2] = _mm512_add_epi32( r[2], C ); - r[3] = _mm512_add_epi32( r[3], D ); - r[4] = _mm512_add_epi32( r[4], E ); - r[5] = _mm512_add_epi32( r[5], F ); - r[6] = _mm512_add_epi32( r[6], G ); - r[7] = _mm512_add_epi32( r[7], H ); - } - else - { - ctx->initialized = true; - r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) ); - r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) ); - r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) ); - r[3] = _mm512_add_epi32( D, m512_const1_64( 
0xA54FF53AA54FF53A ) ); - r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) ); - r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) ); - r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) ); - r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) ); - } -} - void sha256_16way_init( sha256_16way_context *sc ) { - sc->initialized = false; sc->count_high = sc->count_low = 0; + sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m512_const1_64( 0x510E527F510E527F ); + sc->val[5] = m512_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 ); } - void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ) { @@ -1436,7 +927,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data, len -= clen; if ( ptr == buf_size ) { - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -1461,7 +952,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); memset_zero_512( sc->buf, pad >> 2 ); } else @@ -1474,7 +965,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) ); - sha256_16way_round( sc, sc->buf, sc->val ); + sha256_16way_transform_be( sc->val, sc->buf, sc->val ); mm512_block_bswap_32( dst, sc->val ); } diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index c53cb39f..8225595b 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -70,6 +70,8 @@ extern "C"{ C8, C9, CA, CB, CC, CD, CE, CF; \ __m256i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ + const __m256i FIVE = _mm256_set1_epi32( 5 ); \ + const __m256i THREE = _mm256_set1_epi32( 3 ); \ sph_u32 Wlow, Whigh; #define READ_STATE8(state) do \ @@ -314,8 +316,7 @@ do { \ _mm256_andnot_si256( xb3, xb2 ), \ _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ - _mm256_set1_epi32(5UL) ) ), \ - _mm256_set1_epi32(3UL) ) ) ); \ + FIVE ) ), THREE ) ) ); \ xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \ } while (0) @@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) C8, C9, CA, CB, CC, CD, CE, CF; \ __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u32 Wlow, Whigh; + const __m128i FIVE = _mm_set1_epi32( 5 ); \ + const __m128i THREE = _mm_set1_epi32( 3 ); \ + sph_u32 Wlow, Whigh; #define READ_STATE(state) do \ { \ @@ -931,8 +934,8 @@ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ _mm_andnot_si128( xb3, xb2 ), \ _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \ - ) ), _mm_set1_epi32(3UL) ) ) ) ); \ + _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \ + ) ), THREE ) ) ) ); \ xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ } while (0) diff --git a/algo/swifftx/inttypes.h b/algo/swifftx/inttypes.h index 
2b6b941b..9f74eee2 100644 --- a/algo/swifftx/inttypes.h +++ b/algo/swifftx/inttypes.h @@ -18,16 +18,20 @@ #ifndef __INTTYPES_H_ #define __INTTYPES_H_ +#include + /* Use [u]intN_t if you need exactly N bits. XXX - doesn't handle the -mint8 option. */ typedef signed char swift_int8_t; typedef unsigned char swift_uint8_t; - typedef int swift_int16_t; + typedef int32_t swift_int16_t; +// typedef int swift_int16_t; typedef unsigned int swift_uint16_t; - typedef long swift_int32_t; + typedef int32_t swift_int32_t; +// typedef long swift_int32_t; typedef unsigned long swift_uint32_t; typedef long long swift_int64_t; diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index f38ea854..d3ecd15c 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -18,6 +18,8 @@ //#include "stdbool.h" #include +#include "simd-utils.h" + /////////////////////////////////////////////////////////////////////////////////////////////// // Constants and static tables portion. /////////////////////////////////////////////////////////////////////////////////////////////// @@ -49,20 +51,20 @@ // - A: the first operand. After the operation stores the sum of the two operands. // - B: the second operand. After the operation stores the difference between the first and the // second operands. -#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} +//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));} // Quickly reduces an integer modulo 257. // // Parameters: // - A: the input. -#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) +//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8)) // Since we need to do the setup only once, this is the indicator variable: static bool wasSetupDone = false; // This array stores the powers of omegas that correspond to the indices, which are the input // values. Known also as the "outer FFT twiddle factors". -swift_int16_t multipliers[N]; +swift_int16_t multipliers[N] __attribute__ ((aligned (64))); // This array stores the powers of omegas, multiplied by the corresponding values. // We store this table to save computation time. @@ -72,14 +74,14 @@ swift_int16_t multipliers[N]; // compression function, i is between 0 and 31, x_i is a 64-bit value. // One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper -- // formula (2), section 3, page 6. -swift_int16_t fftTable[256 * EIGHTH_N]; +swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64))); // The A's we use in SWIFFTX shall be random elements of Z_257. // We generated these A's from the decimal expansion of PI as follows: we converted each // triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A // element, otherwise move to the next triple of digits in the expansion. This guarntees that // the A's are random, provided that PI digits are. -const swift_int16_t As[3 * M * N] = +const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) = {141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78, 50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93, 95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105, @@ -636,9 +638,202 @@ void InitializeSWIFFTX() wasSetupDone = true; } +// In the original code the F matrix is rotated so it was not aranged +// the same as all the other data. 
Rearanging F to match all the other +// data made vectorizing possible, the compiler probably could have been +// able to auto-vectorize with proper data organisation. +// Also in the original code the custom 16 bit data types are all now 32 +// bit int32_t regardless of the type name. +// void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) { - swift_int16_t *mult = multipliers; +#if defined(__AVX2__) + + __m256i F[8] __attribute__ ((aligned (64))); + __m256i *mul = (__m256i*)multipliers; + __m256i *out = (__m256i*)output; + __m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] ); + + F[0] = _mm256_mullo_epi32( mul[0], *tbl ); + tbl = (__m256i*)&( fftTable[ input[1] << 3 ] ); + F[1] = _mm256_mullo_epi32( mul[1], *tbl ); + tbl = (__m256i*)&( fftTable[ input[2] << 3 ] ); + F[2] = _mm256_mullo_epi32( mul[2], *tbl ); + tbl = (__m256i*)&( fftTable[ input[3] << 3 ] ); + F[3] = _mm256_mullo_epi32( mul[3], *tbl ); + tbl = (__m256i*)&( fftTable[ input[4] << 3 ] ); + F[4] = _mm256_mullo_epi32( mul[4], *tbl ); + tbl = (__m256i*)&( fftTable[ input[5] << 3 ] ); + F[5] = _mm256_mullo_epi32( mul[5], *tbl ); + tbl = (__m256i*)&( fftTable[ input[6] << 3 ] ); + F[6] = _mm256_mullo_epi32( mul[6], *tbl ); + tbl = (__m256i*)&( fftTable[ input[7] << 3 ] ); + F[7] = _mm256_mullo_epi32( mul[7], *tbl ); + + #define ADD_SUB( a, b ) \ + { \ + __m256i tmp = b; \ + b = _mm256_sub_epi32( a, b ); \ + a = _mm256_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[0], F[1] ); + ADD_SUB( F[2], F[3] ); + ADD_SUB( F[4], F[5] ); + ADD_SUB( F[6], F[7] ); + + F[3] = _mm256_slli_epi32( F[3], 4 ); + F[7] = _mm256_slli_epi32( F[7], 4 ); + + ADD_SUB( F[0], F[2] ); + ADD_SUB( F[1], F[3] ); + ADD_SUB( F[4], F[6] ); + ADD_SUB( F[5], F[7] ); + + F[5] = _mm256_slli_epi32( F[5], 2 ); + F[6] = _mm256_slli_epi32( F[6], 4 ); + F[7] = _mm256_slli_epi32( F[7], 6 ); + + ADD_SUB( F[0], F[4] ); + ADD_SUB( F[1], F[5] ); + ADD_SUB( F[2], F[6] ); + ADD_SUB( F[3], F[7] ); + + #undef ADD_SUB + +#if defined (__AVX512VL__) && defined(__AVX512BW__) + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, \ + _mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) ) + +#else + + #define Q_REDUCE( a ) \ + _mm256_sub_epi32( _mm256_and_si256( a, \ + m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) ) + +#endif + + out[0] = Q_REDUCE( F[0] ); + out[1] = Q_REDUCE( F[1] ); + out[2] = Q_REDUCE( F[2] ); + out[3] = Q_REDUCE( F[3] ); + out[4] = Q_REDUCE( F[4] ); + out[5] = Q_REDUCE( F[5] ); + out[6] = Q_REDUCE( F[6] ); + out[7] = Q_REDUCE( F[7] ); + + #undef Q_REDUCE + +#elif defined(__SSE4_1__) + + __m128i F[16] __attribute__ ((aligned (64))); + __m128i *mul = (__m128i*)multipliers; + __m128i *out = (__m128i*)output; + __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] ); + + F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] ); + F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[1] << 3 ] ); + F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] ); + F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[2] << 3 ] ); + F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] ); + F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[3] << 3 ] ); + F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] ); + F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[4] << 3 ] ); + F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] ); + F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[5] << 3 ] ); + F[10] = _mm_mullo_epi32( mul[10], tbl[0] ); + F[11] = 
_mm_mullo_epi32( mul[11], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[6] << 3 ] ); + F[12] = _mm_mullo_epi32( mul[12], tbl[0] ); + F[13] = _mm_mullo_epi32( mul[13], tbl[1] ); + tbl = (__m128i*)&( fftTable[ input[7] << 3 ] ); + F[14] = _mm_mullo_epi32( mul[14], tbl[0] ); + F[15] = _mm_mullo_epi32( mul[15], tbl[1] ); + + #define ADD_SUB( a, b ) \ + { \ + __m128i tmp = b; \ + b = _mm_sub_epi32( a, b ); \ + a = _mm_add_epi32( a, tmp ); \ + } + + ADD_SUB( F[ 0], F[ 2] ); + ADD_SUB( F[ 1], F[ 3] ); + ADD_SUB( F[ 4], F[ 6] ); + ADD_SUB( F[ 5], F[ 7] ); + ADD_SUB( F[ 8], F[10] ); + ADD_SUB( F[ 9], F[11] ); + ADD_SUB( F[12], F[14] ); + ADD_SUB( F[13], F[15] ); + + F[ 6] = _mm_slli_epi32( F[ 6], 4 ); + F[ 7] = _mm_slli_epi32( F[ 7], 4 ); + F[14] = _mm_slli_epi32( F[14], 4 ); + F[15] = _mm_slli_epi32( F[15], 4 ); + + ADD_SUB( F[ 0], F[ 4] ); + ADD_SUB( F[ 1], F[ 5] ); + ADD_SUB( F[ 2], F[ 6] ); + ADD_SUB( F[ 3], F[ 7] ); + ADD_SUB( F[ 8], F[12] ); + ADD_SUB( F[ 9], F[13] ); + ADD_SUB( F[10], F[14] ); + ADD_SUB( F[11], F[15] ); + + F[10] = _mm_slli_epi32( F[10], 2 ); + F[11] = _mm_slli_epi32( F[11], 2 ); + F[12] = _mm_slli_epi32( F[12], 4 ); + F[13] = _mm_slli_epi32( F[13], 4 ); + F[14] = _mm_slli_epi32( F[14], 6 ); + F[15] = _mm_slli_epi32( F[15], 6 ); + + ADD_SUB( F[ 0], F[ 8] ); + ADD_SUB( F[ 1], F[ 9] ); + ADD_SUB( F[ 2], F[10] ); + ADD_SUB( F[ 3], F[11] ); + ADD_SUB( F[ 4], F[12] ); + ADD_SUB( F[ 5], F[13] ); + ADD_SUB( F[ 6], F[14] ); + ADD_SUB( F[ 7], F[15] ); + + #undef ADD_SUB + + #define Q_REDUCE( a ) \ + _mm_sub_epi32( _mm_and_si128( a, \ + m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) ) + + out[ 0] = Q_REDUCE( F[ 0] ); + out[ 1] = Q_REDUCE( F[ 1] ); + out[ 2] = Q_REDUCE( F[ 2] ); + out[ 3] = Q_REDUCE( F[ 3] ); + out[ 4] = Q_REDUCE( F[ 4] ); + out[ 5] = Q_REDUCE( F[ 5] ); + out[ 6] = Q_REDUCE( F[ 6] ); + out[ 7] = Q_REDUCE( F[ 7] ); + out[ 8] = Q_REDUCE( F[ 8] ); + out[ 9] = Q_REDUCE( F[ 9] ); + out[10] = Q_REDUCE( F[10] ); + out[11] = Q_REDUCE( F[11] ); + out[12] = Q_REDUCE( F[12] ); + out[13] = Q_REDUCE( F[13] ); + out[14] = Q_REDUCE( F[14] ); + out[15] = Q_REDUCE( F[15] ); + + #undef Q_REDUCE + +#else // < SSE4.1 + + swift_int16_t *mult = multipliers; + + // First loop unrolling: + register swift_int16_t *table = &(fftTable[input[0] << 3]); /* swift_int32_t F[64]; @@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F50, F51, F52, F53, F54, F55, F56, F57, F58, F59, F60, F61, F62, F63; - // First loop unrolling: - register swift_int16_t *table = &(fftTable[input[0] << 3]); - - F0 = mult[0] * table[0]; - F8 = mult[1] * table[1]; + F0 = mult[0] * table[0]; + F8 = mult[1] * table[1]; F16 = mult[2] * table[2]; F24 = mult[3] * table[3]; F32 = mult[4] * table[4]; @@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) F48 = mult[6] * table[6]; F56 = mult[7] * table[7]; - mult += 8; table = &(fftTable[input[1] << 3]); - F1 = mult[0] * table[0]; - F9 = mult[1] * table[1]; - F17 = mult[2] * table[2]; - F25 = mult[3] * table[3]; - F33 = mult[4] * table[4]; - F41 = mult[5] * table[5]; - F49 = mult[6] * table[6]; - F57 = mult[7] * table[7]; + F1 = mult[ 8] * table[0]; + F9 = mult[ 9] * table[1]; + F17 = mult[10] * table[2]; + F25 = mult[11] * table[3]; + F33 = mult[12] * table[4]; + F41 = mult[13] * table[5]; + F49 = mult[14] * table[6]; + F57 = mult[15] * table[7]; - mult += 8; table = &(fftTable[input[2] << 3]); - F2 = mult[0] * table[0]; - F10 = mult[1] * table[1]; - F18 = mult[2] * table[2]; - F26 = mult[3] * 
table[3]; - F34 = mult[4] * table[4]; - F42 = mult[5] * table[5]; - F50 = mult[6] * table[6]; - F58 = mult[7] * table[7]; + F2 = mult[16] * table[0]; + F10 = mult[17] * table[1]; + F18 = mult[18] * table[2]; + F26 = mult[19] * table[3]; + F34 = mult[20] * table[4]; + F42 = mult[21] * table[5]; + F50 = mult[22] * table[6]; + F58 = mult[23] * table[7]; - mult += 8; table = &(fftTable[input[3] << 3]); - F3 = mult[0] * table[0]; - F11 = mult[1] * table[1]; - F19 = mult[2] * table[2]; - F27 = mult[3] * table[3]; - F35 = mult[4] * table[4]; - F43 = mult[5] * table[5]; - F51 = mult[6] * table[6]; - F59 = mult[7] * table[7]; + F3 = mult[24] * table[0]; + F11 = mult[25] * table[1]; + F19 = mult[26] * table[2]; + F27 = mult[27] * table[3]; + F35 = mult[28] * table[4]; + F43 = mult[29] * table[5]; + F51 = mult[30] * table[6]; + F59 = mult[31] * table[7]; - mult += 8; table = &(fftTable[input[4] << 3]); - F4 = mult[0] * table[0]; - F12 = mult[1] * table[1]; - F20 = mult[2] * table[2]; - F28 = mult[3] * table[3]; - F36 = mult[4] * table[4]; - F44 = mult[5] * table[5]; - F52 = mult[6] * table[6]; - F60 = mult[7] * table[7]; + F4 = mult[32] * table[0]; + F12 = mult[33] * table[1]; + F20 = mult[34] * table[2]; + F28 = mult[35] * table[3]; + F36 = mult[36] * table[4]; + F44 = mult[37] * table[5]; + F52 = mult[38] * table[6]; + F60 = mult[39] * table[7]; - mult += 8; table = &(fftTable[input[5] << 3]); - F5 = mult[0] * table[0]; - F13 = mult[1] * table[1]; - F21 = mult[2] * table[2]; - F29 = mult[3] * table[3]; - F37 = mult[4] * table[4]; - F45 = mult[5] * table[5]; - F53 = mult[6] * table[6]; - F61 = mult[7] * table[7]; + F5 = mult[40] * table[0]; + F13 = mult[41] * table[1]; + F21 = mult[42] * table[2]; + F29 = mult[43] * table[3]; + F37 = mult[44] * table[4]; + F45 = mult[45] * table[5]; + F53 = mult[46] * table[6]; + F61 = mult[47] * table[7]; - mult += 8; table = &(fftTable[input[6] << 3]); - F6 = mult[0] * table[0]; - F14 = mult[1] * table[1]; - F22 = mult[2] * table[2]; - F30 = mult[3] * table[3]; - F38 = mult[4] * table[4]; - F46 = mult[5] * table[5]; - F54 = mult[6] * table[6]; - F62 = mult[7] * table[7]; + F6 = mult[48] * table[0]; + F14 = mult[49] * table[1]; + F22 = mult[50] * table[2]; + F30 = mult[51] * table[3]; + F38 = mult[52] * table[4]; + F46 = mult[53] * table[5]; + F54 = mult[54] * table[6]; + F62 = mult[55] * table[7]; - mult += 8; table = &(fftTable[input[7] << 3]); - F7 = mult[0] * table[0]; - F15 = mult[1] * table[1]; - F23 = mult[2] * table[2]; - F31 = mult[3] * table[3]; - F39 = mult[4] * table[4]; - F47 = mult[5] * table[5]; - F55 = mult[6] * table[6]; - F63 = mult[7] * table[7]; - + F7 = mult[56] * table[0]; + F15 = mult[57] * table[1]; + F23 = mult[58] * table[2]; + F31 = mult[59] * table[3]; + F39 = mult[60] * table[4]; + F47 = mult[61] * table[5]; + F55 = mult[62] * table[6]; + F63 = mult[63] * table[7]; + + #define ADD_SUB( a, b ) \ + { \ + int temp = b; \ + b = a - b; \ + a = a + temp; \ + } + + #define Q_REDUCE( a ) \ + ( ( (a) & 0xff ) - ( (a) >> 8 ) ) + /* for ( int i = 0; i < 8; i++ ) @@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) } */ - // Second loop unrolling: // Iteration 0: ADD_SUB(F0, F1); @@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output) output[47] = Q_REDUCE(F61); output[55] = Q_REDUCE(F62); output[63] = Q_REDUCE(F63); + + #undef ADD_SUB + #undef Q_REDUCE + +#endif // AVX2 elif SSE4.1 else } // Calculates the FFT part of SWIFFT. 
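The AVX2, SSE4.1 and scalar FFT paths above all reduce intermediate values with the same Q_REDUCE pattern, (a & 0xff) - (a >> 8). Since 256 is congruent to -1 mod 257, writing a = hi*256 + lo gives a congruent to lo - hi mod 257, so the reduction stays in the right residue class without any division. A small scalar check of that identity; the helper name and test range are illustrative and not from the patch, and it relies on arithmetic right shift of negative values as GCC provides.

#include <stdio.h>
#include <stdint.h>

// Scalar model of the Q_REDUCE macro used in the FFT above.
static int32_t q_reduce( int32_t a ) { return ( a & 0xff ) - ( a >> 8 ); }

int main()
{
   for ( int32_t a = -100000; a <= 100000; a++ )
   {
      // a - q_reduce(a) == 257 * (a >> 8), so it must be a multiple of 257.
      if ( ( a - q_reduce( a ) ) % 257 )
      {
         printf( "not congruent at %d\n", a );
         return 1;
      }
   }
   printf( "q_reduce(a) is congruent to a mod 257 over the test range\n" );
   return 0;
}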
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output) // - m: the input size divided by 64. // - output: will store the result. // - a: the coefficients in the sum. Of size 64 * m. -void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a) +void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, + const swift_int16_t *a ) { int i, j; - swift_int32_t result[N]; + swift_int32_t result[N] __attribute__ ((aligned (64))); register swift_int16_t carry = 0; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + __m512i *res = (__m512i*)result; + for ( j = 0; j < N/16; ++j ) + { + __m512i sum = _mm512_setzero_si512(); + const __m512i *f = (__m512i*)input + j; + const __m512i *k = (__m512i*)a + j; + for ( i = 0; i < m; i++, f += N/16, k += N/16 ) + sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__AVX2__) + + __m256i *res = (__m256i*)result; + for ( j = 0; j < N/8; ++j ) + { + __m256i sum = _mm256_setzero_si256(); + const __m256i *f = (__m256i*)input + j; + const __m256i *k = (__m256i*)a + j; + for ( i = 0; i < m; i++, f += N/8, k += N/8 ) + sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#elif defined(__SSE4_1__) + + __m128i *res = (__m128i*)result; + for ( j = 0; j < N/4; ++j ) + { + __m128i sum = _mm_setzero_si128(); + const __m128i *f = (__m128i*)input + j; + const __m128i *k = (__m128i*)a + j; + for ( i = 0; i < m; i++, f += N/4, k += N/4 ) + sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) ); + res[j] = sum; + } + +#else + for (j = 0; j < N; ++j) { register swift_int32_t sum = 0; const register swift_int32_t *f = input + j; const register swift_int16_t *k = a + j; - for (i = 0; i < m; i++, f += N,k += N) sum += (*f) * (*k); - result[j] = sum; } +#endif + for (j = 0; j < N; ++j) result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE; @@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets @@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE], { int i; // Will store the result of the FFT parts: - swift_int32_t fftOut[N * M]; - unsigned char intermediate[N * 3 + 8]; + swift_int32_t fftOut[N * M] __attribute__ ((aligned (64))); + unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64))); unsigned char carry0,carry1,carry2; // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets diff --git a/configure b/configure index db3efc9f..ae0d7bec 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.0' -PACKAGE_STRING='cpuminer-opt 3.18.0' +PACKAGE_VERSION='3.18.1' +PACKAGE_STRING='cpuminer-opt 3.18.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.0 +cpuminer-opt configure 3.18.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.0, which was +It was created by cpuminer-opt $as_me 3.18.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.0' + VERSION='3.18.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.0, which was +This file was extended by cpuminer-opt $as_me 3.18.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.0 +cpuminer-opt config.status 3.18.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index fbe5a9b0..869b3669 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.18.0]) +AC_INIT([cpuminer-opt], [3.18.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index c8895381..2a63729e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2083,7 +2083,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) / ( opt_target_factor * opt_diff_factor ); diff_to_hash( g_work->target, g_work->targetdiff ); - // Increment extranonce2 + // Pre increment extranonce2 in case of being called again before receiving + // a new job for ( int t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); @@ -2103,20 +2104,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) pthread_mutex_unlock( &stats_lock ); - if ( !opt_quiet ) - { - int mismatch = submitted_share_count - - ( accepted_share_count + stale_share_count + rejected_share_count ); - if ( mismatch ) - applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count ); - } - if ( stratum_diff != sctx->job.diff ) applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s", sctx->job.diff, sctx->block_height, g_work->job_id ); else if ( last_block_height != sctx->block_height ) - applog( LOG_BLUE, "New Block %d, Job %s", - sctx->block_height, g_work->job_id ); + applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s", + sctx->block_height, net_diff, g_work->job_id ); else if ( g_work->job_id && new_job ) applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s", sctx->block_height, net_diff, g_work->job_id ); @@ -2173,7 +2166,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) { double net_hr = nd / net_ttf; char net_hr_units[4] = {0}; - scale_hash_for_display ( &net_hr, net_hr_units ); applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s", net_hr, net_hr_units ); @@ -2182,6 +2174,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) } // hr > 0 } // !quiet } // new diff/block + + if ( new_job && !opt_quiet ) + { + int mismatch = submitted_share_count - ( accepted_share_count + + stale_share_count + + rejected_share_count ); + if ( mismatch ) + applog( LOG_INFO, + CL_LBL "%d Submitted share pending, maybe stale" CL_N, + submitted_share_count ); + } } static void *miner_thread( void *userdata ) @@ -3970,6 +3973,7 @@ int main(int argc, char *argv[]) gettimeofday( &last_submit_time, NULL ); memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) ); memcpy( &session_start, &last_submit_time, sizeof (struct timeval) ); + memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) ); pthread_mutex_unlock( &stats_lock ); applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm", diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 3d840107..1116976f 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -1,7 +1,7 @@ #if !defined(SIMD_256_H__) #define SIMD_256_H__ 1 -#if defined(__AVX2__) +//#if defined(__AVX2__) 
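The stratum_gen_work() change shown earlier pre-increments extranonce2 with a single for loop: each byte is incremented, and the loop only advances to the next byte when ++ wraps to zero, i.e. it ripples a carry through a little-endian multi-byte counter. A standalone sketch of that idiom; the buffer name and size here are hypothetical, not taken from the patch.

#include <stdio.h>
#include <stdint.h>

int main()
{
   // Little-endian multi-byte counter, as xnonce2 is treated in the loop above.
   uint8_t xnonce2[4] = { 0xff, 0xff, 0x00, 0x00 };
   const int xnonce2_size = 4;

   // ++xnonce2[t] wraps to 0 on overflow; only then does the carry move on.
   for ( int t = 0; t < xnonce2_size && !( ++xnonce2[t] ); t++ );

   // 0x0000ffff + 1 -> bytes 00 00 01 00 (little endian).
   printf( "%02x %02x %02x %02x\n",
           xnonce2[0], xnonce2[1], xnonce2[2], xnonce2[3] );
   return 0;
}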
///////////////////////////////////////////////////////////////////// // @@ -14,7 +14,9 @@ // is limited because 256 bit vectors are less likely to be used when 512 // is available. -// Used instead if casting. +#if defined(__AVX__) + +// Used instead of casting. typedef union { __m256i m256; @@ -23,6 +25,28 @@ typedef union uint32_t u32[8]; } __attribute__ ((aligned (32))) m256_ovly; +// +// Pointer casting + +// p = any aligned pointer +// returns p as pointer to vector type, not very useful +#define castp_m256i(p) ((__m256i*)(p)) + +// p = any aligned pointer +// returns *p, watch your pointer arithmetic +#define cast_m256i(p) (*((__m256i*)(p))) + +// p = any aligned pointer, i = scaled array index +// returns value p[i] +#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) + +// p = any aligned pointer, o = scaled offset +// returns pointer p+o +#define casto_m256i(p,o) (((__m256i*)(p))+(o)) + +#endif +#if defined(__AVX2__) + // Move integer to low element of vector, other elements are set to zero. #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) ) @@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn() #define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v ) #define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 ) -// -// Pointer casting - -// p = any aligned pointer -// returns p as pointer to vector type, not very useful -#define castp_m256i(p) ((__m256i*)(p)) - -// p = any aligned pointer -// returns *p, watch your pointer arithmetic -#define cast_m256i(p) (*((__m256i*)(p))) - -// p = any aligned pointer, i = scaled array index -// returns value p[i] -#define casti_m256i(p,i) (((__m256i*)(p))[(i)]) - -// p = any aligned pointer, o = scaled offset -// returns pointer p+o -#define casto_m256i(p,o) (((__m256i*)(p))+(o)) - - // // Memory functions // n = number of 256 bit (32 byte) vectors diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index de948cc4..3cc090a4 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n ) // Rotate 256 bit lanes by one 64 bit element #define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) - #define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) // Rotate 256 bit lanes by one 32 bit element @@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c ) // shufl2r is 2 input ... // Drop macros? They can easilly be rebuilt using shufl2 functions -// add shuflr shufll functions performing rotate, returning first arg -// They're faster than doing both, when both not needed. - // Shuffle concatenated { v1, v2 ) right or left by 256 bits and return // rotated v1 // visually confusing for shif2r because of arg order. First arg is always
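The pointer-casting macros moved under the __AVX__ guard above (castp_m256i, cast_m256i, casti_m256i, casto_m256i) simply index ordinary aligned memory as an array of __m256i. A short usage sketch under that assumption; the buffer and values are illustrative, and the two macros used are repeated locally so the snippet stands alone (compile with -mavx).

#include <immintrin.h>
#include <stdint.h>

// Same definitions as in simd-256.h above, repeated so this compiles standalone.
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
#define casto_m256i(p,o) (((__m256i*)(p))+(o))

int main()
{
   // 64 bytes = two 256-bit vectors, aligned for vector access.
   uint32_t buf[16] __attribute__ ((aligned (32))) = {0};

   // casti_m256i indexes the buffer by vector: write vectors 0 and 1.
   casti_m256i( buf, 0 ) = _mm256_set1_epi32( 1 );
   casti_m256i( buf, 1 ) = _mm256_set1_epi32( 2 );

   // casto_m256i yields a pointer to the o'th vector instead of a value.
   __m256i *second = casto_m256i( buf, 1 );

   // buf[0..7] now hold 1 and buf[8..15] hold 2.
   return ( buf[0] == 1 && buf[8] == 2 && second == (__m256i*)buf + 1 ) ? 0 : 1;
}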