diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 6f77267f..3fd95f1a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,18 @@ If not what makes it happen or not happen? Change Log ---------- +v3.19.8 + +#370 "stratum+ssl", in addition to "stratum+tcps", is now recognized as a valid +url protocol specifier for requesting a secure stratum connection. + +The full url, including the protocol, is now displayed in the stratum connect +log and the periodic summary log. + +Small optimizations to Cubehash, AVX2 & AVX512. + +Byte order and prehash optimizations for blake256 & blake512, AVX2 & AVX512. + v3.19.7 #369 Fixed time limited mining, --time-limit. diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index a5d74e0a..31a553b1 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -98,6 +98,12 @@ typedef blake_8way_small_context blake256_8way_context; void blake256_8way_init(void *cc); void blake256_8way_update(void *cc, const void *data, size_t len); void blake256_8way_close(void *cc, void *dst); +void blake256_8way_update_le(void *cc, const void *data, size_t len); +void blake256_8way_close_le(void *cc, void *dst); +void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, + const void *data ); +void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, + const void *midhash, const void *data ); // 14 rounds, blake, decred typedef blake_8way_small_context blake256r14_8way_context; @@ -128,6 +134,12 @@ void blake512_4way_update( void *cc, const void *data, size_t len ); void blake512_4way_close( void *cc, void *dst ); void blake512_4way_full( blake_4way_big_context *sc, void * dst, const void *data, size_t len ); +void blake512_4way_full_le( blake_4way_big_context *sc, void * dst, + const void *data, size_t len ); +void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate, + const void *data ); +void blake512_4way_final_le( blake_4way_big_context *sc, void *hash, + const __m256i nonce, const __m256i *midstate ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -148,6 +160,14 @@ typedef blake_16way_small_context blake256_16way_context; void blake256_16way_init(void *cc); void blake256_16way_update(void *cc, const void *data, size_t len); void blake256_16way_close(void *cc, void *dst); +// Expects data in little endian order, no byte swap needed +void blake256_16way_update_le(void *cc, const void *data, size_t len); +void blake256_16way_close_le(void *cc, void *dst); +void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, + const void *data ); +void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, + const void *midhash, const void *data ); + // 14 rounds, blake, decred typedef blake_16way_small_context blake256r14_16way_context; @@ -180,7 +200,12 @@ void blake512_8way_update( void *cc, const void *data, size_t len ); void blake512_8way_close( void *cc, void *dst ); void blake512_8way_full( blake_8way_big_context *sc, void * dst, const void *data, size_t len ); -void blake512_8way_hash_le80( void *hash, const void *data ); +void blake512_8way_full_le( blake_8way_big_context *sc, void * dst, + const void *data, size_t len ); +void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate, + const void *data ); +void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, + const __m512i nonce, const __m512i *midstate ); #endif // AVX512 #endif // AVX2 diff --git 
a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index 65fbe1fa..4280e3d5 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -508,14 +508,10 @@ do { \ V9 = m128_const1_64( 0x85A308D385A308D3 ); \ VA = m128_const1_64( 0x13198A2E13198A2E ); \ VB = m128_const1_64( 0x0370734403707344 ); \ - VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \ - m128_const1_64( 0xA4093822A4093822 ) ); \ - VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \ - m128_const1_64( 0x299F31D0299F31D0 ) ); \ - VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \ - m128_const1_64( 0x082EFA98082EFA98 ) ); \ - VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \ - m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ + VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \ + VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \ + VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \ + VF = _mm_set1_epi32( T1 ^ 0xEC4E6C89 ); \ BLAKE256_4WAY_BLOCK_BSWAP32; \ ROUND_S_4WAY(0); \ ROUND_S_4WAY(1); \ @@ -626,14 +622,10 @@ do { \ V9 = m256_const1_64( 0x85A308D385A308D3 ); \ VA = m256_const1_64( 0x13198A2E13198A2E ); \ VB = m256_const1_64( 0x0370734403707344 ); \ - VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\ - m256_const1_64( 0xA4093822A4093822 ) ); \ - VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\ - m256_const1_64( 0x299F31D0299F31D0 ) ); \ - VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ - m256_const1_64( 0x082EFA98082EFA98 ) ); \ - VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ - m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ + VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \ + VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \ + VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \ + VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \ shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \ 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \ @@ -679,13 +671,220 @@ do { \ H7 = mm256_xor3( VF, V7, H7 ); \ } while (0) +#define COMPRESS32_8WAY_LE( rounds ) \ +do { \ + __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ + __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ + __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ + __m256i V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = m256_const1_64( 0x243F6A88243F6A88 ); \ + V9 = m256_const1_64( 0x85A308D385A308D3 ); \ + VA = m256_const1_64( 0x13198A2E13198A2E ); \ + VB = m256_const1_64( 0x0370734403707344 ); \ + VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \ + VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \ + VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \ + VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \ + M0 = buf[ 0]; \ + M1 = buf[ 1]; \ + M2 = buf[ 2]; \ + M3 = buf[ 3]; \ + M4 = buf[ 4]; \ + M5 = buf[ 5]; \ + M6 = buf[ 6]; \ + M7 = buf[ 7]; \ + M8 = buf[ 8]; \ + M9 = buf[ 9]; \ + MA = buf[10]; \ + MB = buf[11]; \ + MC = buf[12]; \ + MD = buf[13]; \ + ME = buf[14]; \ + MF = buf[15]; \ + ROUND_S_8WAY(0); \ + ROUND_S_8WAY(1); \ + ROUND_S_8WAY(2); \ + ROUND_S_8WAY(3); \ + ROUND_S_8WAY(4); \ + ROUND_S_8WAY(5); \ + ROUND_S_8WAY(6); \ + ROUND_S_8WAY(7); \ + if (rounds == 14) \ + { \ + ROUND_S_8WAY(8); \ + ROUND_S_8WAY(9); \ + ROUND_S_8WAY(0); \ + ROUND_S_8WAY(1); \ + ROUND_S_8WAY(2); \ + ROUND_S_8WAY(3); \ + } \ + H0 = mm256_xor3( V8, V0, H0 ); \ + H1 = mm256_xor3( V9, V1, H1 ); \ + H2 = mm256_xor3( VA, V2, H2 ); \ + H3 = mm256_xor3( VB, V3, H3 ); \ + H4 = mm256_xor3( VC, V4, H4 ); \ + H5 = mm256_xor3( VD, V5, H5 ); \ + H6 = mm256_xor3( VE, V6, H6 ); \ + H7 = mm256_xor3( VF, V7, H7 ); \ +} while (0) + +void 
blake256_8way_round0_prehash_le( void *midstate, const void *midhash, + const void *data ) +{ + const __m256i *M = (const __m256i*)data; + __m256i *V = (__m256i*)midstate; + const __m256i *H = (const __m256i*)midhash; + + V[ 0] = H[0]; + V[ 1] = H[1]; + V[ 2] = H[2]; + V[ 3] = H[3]; + V[ 4] = H[4]; + V[ 5] = H[5]; + V[ 6] = H[6]; + V[ 7] = H[7]; + V[ 8] = m256_const1_32( CS0 ); + V[ 9] = m256_const1_32( CS1 ); + V[10] = m256_const1_32( CS2 ); + V[11] = m256_const1_32( CS3 ); + V[12] = m256_const1_32( CS4 ^ 0x280 ); + V[13] = m256_const1_32( CS5 ^ 0x280 ); + V[14] = m256_const1_32( CS6 ); + V[15] = m256_const1_32( CS7 ); + + // G0 + GS_8WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] ); + + // G1 + V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ), + _mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) ); + V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 ); + V[ 9] = _mm256_add_epi32( V[ 9], V[13] ); + V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 ); + V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] ); + + // G2,G3 + GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] ); + GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] ); + + // G4 + V[ 0] = _mm256_add_epi32( V[ 0], + _mm256_xor_si256( _mm256_set1_epi32( CS9 ), M[ 8] ) ); +} + +void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, + const void *midhash, const void *data ) +{ + __m256i *H = (__m256i*)final_hash; + const __m256i *h = (const __m256i*)midhash; + const __m256i *v= (const __m256i*)midstate; + __m256i V0, V1, V2, V3, V4, V5, V6, V7; + __m256i V8, V9, VA, VB, VC, VD, VE, VF; + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + + V0 = v[ 0]; + V1 = v[ 1]; + V2 = v[ 2]; + V3 = v[ 3]; + V4 = v[ 4]; + V5 = v[ 5]; + V6 = v[ 6]; + V7 = v[ 7]; + V8 = v[ 8]; + V9 = v[ 9]; + VA = v[10]; + VB = v[11]; + VC = v[12]; + VD = v[13]; + VE = v[14]; + VF = v[15]; + + M0 = casti_m256i( data, 0 ); + M1 = casti_m256i( data, 1 ); + M2 = casti_m256i( data, 2 ); + M3 = casti_m256i( data, 3 ); + M4 = casti_m256i( data, 4 ); + M5 = casti_m256i( data, 5 ); + M6 = casti_m256i( data, 6 ); + M7 = casti_m256i( data, 7 ); + M8 = casti_m256i( data, 8 ); + M9 = casti_m256i( data, 9 ); + MA = casti_m256i( data, 10 ); + MB = casti_m256i( data, 11 ); + MC = casti_m256i( data, 12 ); + MD = casti_m256i( data, 13 ); + ME = casti_m256i( data, 14 ); + MF = casti_m256i( data, 15 ); + + // Finish round 0 + // G1 + V1 = _mm256_add_epi32( V1, + _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) ); + VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 ); + V9 = _mm256_add_epi32( V9, VD ); + V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 ); + + // G4 + V0 = _mm256_add_epi32( V0, V5 ); + VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 ); + VA = _mm256_add_epi32( VA, VF ); + V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 ); + V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5, + _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) ); + VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 ); + VA = _mm256_add_epi32( VA, VF ); + V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 ); + + // G5,G6,G7 + GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); + GS_8WAY( MC, MD, CSC, CSD, V2, V7, V8, VD ); + GS_8WAY( ME, MF, CSE, CSF, V3, V4, V9, VE ); + + // Remaining rounds + ROUND_S_8WAY( 1 ); + ROUND_S_8WAY( 2 ); + ROUND_S_8WAY( 3 ); + ROUND_S_8WAY( 4 ); + ROUND_S_8WAY( 5 ); + ROUND_S_8WAY( 6 ); + ROUND_S_8WAY( 7 ); + ROUND_S_8WAY( 8 ); + ROUND_S_8WAY( 9 ); + ROUND_S_8WAY( 0 ); + ROUND_S_8WAY( 1 ); 
+ ROUND_S_8WAY( 2 ); + ROUND_S_8WAY( 3 ); + + const __m256i shuf_bswap32 = + m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + + H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 ); + H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 ); + H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 ); + H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 ); + H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 ); + H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 ); + H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 ); + H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 ); +} + #endif #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// Blaske-256 16 way AVX512 +// Blake-256 16 way AVX512 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \ do { \ @@ -763,14 +962,10 @@ do { \ V9 = m512_const1_64( 0x85A308D385A308D3 ); \ VA = m512_const1_64( 0x13198A2E13198A2E ); \ VB = m512_const1_64( 0x0370734403707344 ); \ - VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\ - m512_const1_64( 0xA4093822A4093822 ) ); \ - VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\ - m512_const1_64( 0x299F31D0299F31D0 ) ); \ - VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \ - m512_const1_64( 0x082EFA98082EFA98 ) ); \ - VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \ - m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ + VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \ + VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \ + VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \ + VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \ shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \ 0x2c2d2e2f28292a2b, 0x2425262720212223, \ 0x1c1d1e1f18191a1b, 0x1415161710111213, \ @@ -818,6 +1013,239 @@ do { \ H7 = mm512_xor3( VF, V7, H7 ); \ } while (0) +#define COMPRESS32_16WAY_LE( rounds ) \ +do { \ + __m512i M0, M1, M2, M3, M4, M5, M6, M7; \ + __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ + __m512i V0, V1, V2, V3, V4, V5, V6, V7; \ + __m512i V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = m512_const1_64( 0x243F6A88243F6A88 ); \ + V9 = m512_const1_64( 0x85A308D385A308D3 ); \ + VA = m512_const1_64( 0x13198A2E13198A2E ); \ + VB = m512_const1_64( 0x0370734403707344 ); \ + VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \ + VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \ + VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \ + VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \ + M0 = buf[ 0]; \ + M1 = buf[ 1]; \ + M2 = buf[ 2]; \ + M3 = buf[ 3]; \ + M4 = buf[ 4]; \ + M5 = buf[ 5]; \ + M6 = buf[ 6]; \ + M7 = buf[ 7]; \ + M8 = buf[ 8]; \ + M9 = buf[ 9]; \ + MA = buf[10]; \ + MB = buf[11]; \ + MC = buf[12]; \ + MD = buf[13]; \ + ME = buf[14]; \ + MF = buf[15]; \ + ROUND_S_16WAY(0); \ + ROUND_S_16WAY(1); \ + ROUND_S_16WAY(2); \ + ROUND_S_16WAY(3); \ + ROUND_S_16WAY(4); \ + ROUND_S_16WAY(5); \ + ROUND_S_16WAY(6); \ + ROUND_S_16WAY(7); \ + if (rounds == 14) \ + { \ + ROUND_S_16WAY(8); \ + ROUND_S_16WAY(9); \ + ROUND_S_16WAY(0); \ + ROUND_S_16WAY(1); \ + ROUND_S_16WAY(2); \ + ROUND_S_16WAY(3); \ + } \ + H0 = mm512_xor3( V8, V0, H0 ); \ + H1 = mm512_xor3( V9, V1, H1 ); \ + H2 = mm512_xor3( VA, V2, H2 ); \ + H3 = mm512_xor3( VB, V3, H3 ); \ + H4 = mm512_xor3( VC, V4, H4 ); \ + H5 = mm512_xor3( VD, V5, H5 ); \ + H6 = mm512_xor3( VE, V6, H6 ); \ + H7 = 
mm512_xor3( VF, V7, H7 ); \ +} while (0) + + +// data points to a prefilled final block containing the last 16 bytes of the +// blockheader plus padding. midhash is the hash from the first block. +// Prehash as much as possible without the nonce. +void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, + const void *data ) +{ + const __m512i *M = (const __m512i*)data; + __m512i *V = (__m512i*)midstate; + const __m512i *H = (const __m512i*)midhash; + + V[ 0] = H[0]; + V[ 1] = H[1]; + V[ 2] = H[2]; + V[ 3] = H[3]; + V[ 4] = H[4]; + V[ 5] = H[5]; + V[ 6] = H[6]; + V[ 7] = H[7]; + V[ 8] = m512_const1_32( CS0 ); + V[ 9] = m512_const1_32( CS1 ); + V[10] = m512_const1_32( CS2 ); + V[11] = m512_const1_32( CS3 ); + V[12] = m512_const1_32( CS4 ^ 0x280 ); + V[13] = m512_const1_32( CS5 ^ 0x280 ); + V[14] = m512_const1_32( CS6 ); + V[15] = m512_const1_32( CS7 ); + +// G0 + GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] ); + +// G1 +// GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); + V[ 1] = _mm512_add_epi32( _mm512_add_epi32( V[ 1], V[ 5] ), + _mm512_xor_si512( _mm512_set1_epi32( CS3 ), M[ 2] ) ); + V[13] = mm512_ror_32( _mm512_xor_si512( V[13], V[ 1] ), 16 ); + V[ 9] = _mm512_add_epi32( V[ 9], V[13] ); + V[ 5] = mm512_ror_32( _mm512_xor_si512( V[ 5], V[ 9] ), 12 ); + V[ 1] = _mm512_add_epi32( V[ 1], V[ 5] ); + + +// G2,G3 + GS_16WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] ); + GS_16WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] ); + +// G4 +// GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); + V[ 0] = _mm512_add_epi32( V[ 0], + _mm512_xor_si512( _mm512_set1_epi32( CS9 ), M[ 8] ) ); + +// G5,G6,G7 +// GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); +// GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); +// GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); + +} + +void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, + const void *midhash, const void *data ) +{ + __m512i *H = (__m512i*)final_hash; + const __m512i *h = (const __m512i*)midhash; + const __m512i *v= (const __m512i*)midstate; + __m512i V0, V1, V2, V3, V4, V5, V6, V7; + __m512i V8, V9, VA, VB, VC, VD, VE, VF; + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + + V0 = v[ 0]; + V1 = v[ 1]; + V2 = v[ 2]; + V3 = v[ 3]; + V4 = v[ 4]; + V5 = v[ 5]; + V6 = v[ 6]; + V7 = v[ 7]; + V8 = v[ 8]; + V9 = v[ 9]; + VA = v[10]; + VB = v[11]; + VC = v[12]; + VD = v[13]; + VE = v[14]; + VF = v[15]; + + M0 = casti_m512i( data, 0 ); + M1 = casti_m512i( data, 1 ); + M2 = casti_m512i( data, 2 ); + M3 = casti_m512i( data, 3 ); + M4 = casti_m512i( data, 4 ); + M5 = casti_m512i( data, 5 ); + M6 = casti_m512i( data, 6 ); + M7 = casti_m512i( data, 7 ); + M8 = casti_m512i( data, 8 ); + M9 = casti_m512i( data, 9 ); + MA = casti_m512i( data, 10 ); + MB = casti_m512i( data, 11 ); + MC = casti_m512i( data, 12 ); + MD = casti_m512i( data, 13 ); + ME = casti_m512i( data, 14 ); + MF = casti_m512i( data, 15 ); + + // Finish round 0 + // G0 + // GS_16WAY( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] ); + + // G1 + // GS_16WAY( M2, M3, CS2, CS3, V1, V5, V9, VD ); + + V1 = _mm512_add_epi32( V1, + _mm512_xor_si512( _mm512_set1_epi32( CS2 ), M3 ) ); + VD = mm512_ror_32( _mm512_xor_si512( VD, V1 ), 8 ); + V9 = _mm512_add_epi32( V9, VD ); + V5 = mm512_ror_32( _mm512_xor_si512( V5, V9 ), 7 ); + + // G2,G3 + // GS_16WAY( M4, M5, CS4, CS5, V2, V6, VA, VE ); + // GS_16WAY( M6, M7, 
CS6, CS7, V3, V7, VB, VF ); + + // G4 + // GS_16WAY( M8, M9, CS8, CS9, V0, V5, VA, VF ); + + V0 = _mm512_add_epi32( V0, V5 ); + VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 16 ); + VA = _mm512_add_epi32( VA, VF ); + V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 12 ); + V0 = _mm512_add_epi32( V0, _mm512_add_epi32( V5, + _mm512_xor_si512( _mm512_set1_epi32( CS8 ), M9 ) ) ); + VF = mm512_ror_32( _mm512_xor_si512( VF, V0 ), 8 ); + VA = _mm512_add_epi32( VA, VF ); + V5 = mm512_ror_32( _mm512_xor_si512( V5, VA ), 7 ); + + // G5,G6,G7 + GS_16WAY( MA, MB, CSA, CSB, V1, V6, VB, VC ); + GS_16WAY( MC, MD, CSC, CSD, V2, V7, V8, VD ); + GS_16WAY( ME, MF, CSE, CSF, V3, V4, V9, VE ); + + // Remaining rounds + ROUND_S_16WAY( 1 ); + ROUND_S_16WAY( 2 ); + ROUND_S_16WAY( 3 ); + ROUND_S_16WAY( 4 ); + ROUND_S_16WAY( 5 ); + ROUND_S_16WAY( 6 ); + ROUND_S_16WAY( 7 ); + ROUND_S_16WAY( 8 ); + ROUND_S_16WAY( 9 ); + ROUND_S_16WAY( 0 ); + ROUND_S_16WAY( 1 ); + ROUND_S_16WAY( 2 ); + ROUND_S_16WAY( 3 ); + + const __m512i shuf_bswap32 = + m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, + 0x2c2d2e2f28292a2b, 0x2425262720212223, + 0x1c1d1e1f18191a1b, 0x1415161710111213, + 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + + H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 ); + H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 ); + H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 ); + H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 ); + H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 ); + H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 ); + H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 ); + H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 ); +} + #endif // Blake-256 4 way @@ -913,8 +1341,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, memset_zero_128( buf + vptr + 1, 13 - vptr ); buf[ 13 ] = _mm_or_si128( buf[ 13 ], m128_const1_64( 0x0100000001000000ULL ) ); - buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) ); - buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) ); + buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) ); + buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) ); blake32_4way( ctx, buf + vptr, 64 - ptr ); } else @@ -926,8 +1354,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, memset_zero_128( buf, 56>>2 ); buf[ 13 ] = _mm_or_si128( buf[ 13 ], m128_const1_64( 0x0100000001000000ULL ) ); - buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) ); - buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) ); + buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) ); + buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) ); blake32_4way( ctx, buf, 64 ); } @@ -1033,22 +1461,117 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, if ( out_size_w32 == 8 ) buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_const1_64( 0x0100000001000000ULL ) ); - *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); - *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); + *(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) ); + *(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) ); blake32_8way( sc, buf + (ptr>>2), 64 - ptr ); } else { - memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); - blake32_8way( sc, buf + (ptr>>2), 64 - ptr ); + memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); + blake32_8way( sc, buf + (ptr>>2), 64 - ptr ); + sc->T0 = SPH_C32(0xFFFFFE00UL); + sc->T1 = SPH_C32(0xFFFFFFFFUL); + memset_zero_256( 
buf, 56>>2 ); + if ( out_size_w32 == 8 ) + buf[52>>2] = m256_const1_64( 0x0100000001000000ULL ); + *(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) ); + *(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) ); + blake32_8way( sc, buf, 64 ); + } + mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H ); +} + +static void +blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf; + size_t ptr; + const int buf_size = 64; // number of elements, sizeof/4 + DECL_STATE32_8WAY + buf = sc->buf; + ptr = sc->ptr; + if ( len < buf_size - ptr ) + { + memcpy_256( buf + (ptr>>2), vdata, len>>2 ); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32_8WAY(sc); + while ( len > 0 ) + { + size_t clen; + + clen = buf_size - ptr; + if (clen > len) + clen = len; + memcpy_256( buf + (ptr>>2), vdata, clen>>2 ); + ptr += clen; + vdata += (clen>>2); + len -= clen; + if ( ptr == buf_size ) + { + if ( ( T0 = SPH_T32(T0 + 512) ) < 512 ) + T1 = SPH_T32(T1 + 1); + COMPRESS32_8WAY_LE( sc->rounds ); + ptr = 0; + } + } + WRITE_STATE32_8WAY(sc); + sc->ptr = ptr; +} + +static void +blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32 ) +{ + __m256i buf[16]; + size_t ptr; + unsigned bit_len; + sph_u32 th, tl; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3); + buf[ptr>>2] = m256_const1_32( 0x80000000 ); + tl = sc->T0 + bit_len; + th = sc->T1; + + if ( ptr == 0 ) + { sc->T0 = SPH_C32(0xFFFFFE00UL); sc->T1 = SPH_C32(0xFFFFFFFFUL); - memset_zero_256( buf, 56>>2 ); + } + else if ( sc->T0 == 0 ) + { + sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } + else + sc->T0 -= 512 - bit_len; + + if ( ptr <= 52 ) + { + memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 ); if ( out_size_w32 == 8 ) - buf[52>>2] = m256_const1_64( 0x0100000001000000ULL ); - *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); - *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); - blake32_8way( sc, buf, 64 ); + buf[52>>2] = _mm256_or_si256( buf[52>>2], m256_one_32 ); + *(buf+(56>>2)) = _mm256_set1_epi32( th ); + *(buf+(60>>2)) = _mm256_set1_epi32( tl ); + blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr ); + } + else + { + memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); + blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr ); + sc->T0 = SPH_C32(0xFFFFFE00UL); + sc->T1 = SPH_C32(0xFFFFFFFFUL); + memset_zero_256( buf, 56>>2 ); + if ( out_size_w32 == 8 ) + buf[52>>2] = m256_one_32; + *(buf+(56>>2)) = _mm256_set1_epi32( th ); + *(buf+(60>>2)) = _mm256_set1_epi32( tl ); + blake32_8way_le( sc, buf, 64 ); } mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H ); } @@ -1117,7 +1640,6 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len ) WRITE_STATE32_16WAY(sc); sc->ptr = ptr; } - static void blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) @@ -1152,22 +1674,116 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n, if ( out_size_w32 == 8 ) buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_const1_64( 0x0100000001000000ULL ) ); - buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) ); - buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) ); + buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) ); + buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) ); blake32_16way( sc, buf + (ptr>>2), 64 - ptr ); } else { - memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); - blake32_16way( sc, buf + 
(ptr>>2), 64 - ptr ); + memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); + blake32_16way( sc, buf + (ptr>>2), 64 - ptr ); + sc->T0 = 0xFFFFFE00UL; + sc->T1 = 0xFFFFFFFFUL; + memset_zero_512( buf, 56>>2 ); + if ( out_size_w32 == 8 ) + buf[52>>2] = m512_const1_64( 0x0100000001000000ULL ); + buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) ); + buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) ); + blake32_16way( sc, buf, 64 ); + } + mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H ); +} + +static void +blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len ) +{ + __m512i *vdata = (__m512i*)data; + __m512i *buf; + size_t ptr; + const int buf_size = 64; // number of elements, sizeof/4 + DECL_STATE32_16WAY + buf = sc->buf; + ptr = sc->ptr; + + // only if calling update with 80 + if ( len < buf_size - ptr ) + { + memcpy_512( buf + (ptr>>2), vdata, len>>2 ); + ptr += len; + sc->ptr = ptr; + return; + } + READ_STATE32_16WAY(sc); + while ( len > 0 ) + { + size_t clen; + + clen = buf_size - ptr; + if (clen > len) + clen = len; + memcpy_512( buf + (ptr>>2), vdata, clen>>2 ); + ptr += clen; + vdata += (clen>>2); + len -= clen; + if ( ptr == buf_size ) + { + if ( ( T0 = T0 + 512 ) < 512 ) + T1 = T1 + 1; + COMPRESS32_16WAY_LE( sc->rounds ); + ptr = 0; + } + } + WRITE_STATE32_16WAY(sc); + sc->ptr = ptr; +} + +static void +blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32 ) +{ + __m512i buf[16]; + size_t ptr; + unsigned bit_len; + sph_u32 th, tl; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3); + buf[ptr>>2] = m512_const1_32( 0x80000000 ); + tl = sc->T0 + bit_len; + th = sc->T1; + + if ( ptr == 0 ) + { sc->T0 = 0xFFFFFE00UL; sc->T1 = 0xFFFFFFFFUL; - memset_zero_512( buf, 56>>2 ); - if ( out_size_w32 == 8 ) - buf[52>>2] = m512_const1_64( 0x0100000001000000ULL ); - buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) ); - buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) ); - blake32_16way( sc, buf, 64 ); + } + else if ( sc->T0 == 0 ) + { + sc->T0 = 0xFFFFFE00UL + bit_len; + sc->T1 = sc->T1 - 1; + } + else + sc->T0 -= 512 - bit_len; + + if ( ptr <= 52 ) + { + memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 ); + buf[52>>2] = _mm512_or_si512( buf[52>>2], m512_one_32 ); + buf[56>>2] = _mm512_set1_epi32( th ); + buf[60>>2] = _mm512_set1_epi32( tl ); + blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr ); + } + else + { + memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 ); + blake32_16way_le( sc, buf + (ptr>>2), 64 - ptr ); + sc->T0 = 0xFFFFFE00UL; + sc->T1 = 0xFFFFFFFFUL; + memset_zero_512( buf, 56>>2 ); + buf[52>>2] = m512_one_32; + buf[56>>2] = _mm512_set1_epi32( th ); + buf[60>>2] = _mm512_set1_epi32( tl ); + blake32_16way_le( sc, buf, 64 ); } mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H ); } @@ -1190,6 +1806,18 @@ blake256_16way_close(void *cc, void *dst) blake32_16way_close(cc, 0, 0, dst, 8); } +void +blake256_16way_update_le(void *cc, const void *data, size_t len) +{ + blake32_16way_le(cc, data, len); +} + +void +blake256_16way_close_le(void *cc, void *dst) +{ + blake32_16way_close_le(cc, 0, 0, dst, 8); +} + void blake256r14_16way_init(void *cc) { blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 ); @@ -1271,6 +1899,18 @@ blake256_8way_close(void *cc, void *dst) blake32_8way_close(cc, 0, 0, dst, 8); } +void +blake256_8way_update_le(void *cc, const void *data, size_t len) +{ + blake32_8way_le(cc, data, len); +} + +void +blake256_8way_close_le(void *cc, void *dst) +{ + blake32_8way_close_le(cc, 
0, 0, dst, 8); +} + #endif // 14 rounds Blake, Decred diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index d1b5d2bf..43ace33b 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -361,14 +361,10 @@ static const sph_u64 CB[16] = { V9 = m512_const1_64( CB1 ); \ VA = m512_const1_64( CB2 ); \ VB = m512_const1_64( CB3 ); \ - VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ - m512_const1_64( CB4 ) ); \ - VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \ - m512_const1_64( CB5 ) ); \ - VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \ - m512_const1_64( CB6 ) ); \ - VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \ - m512_const1_64( CB7 ) ); \ + VC = _mm512_set1_epi64( T0 ^ CB4 ); \ + VD = _mm512_set1_epi64( T0 ^ CB5 ); \ + VE = _mm512_set1_epi64( T1 ^ CB6 ); \ + VF = _mm512_set1_epi64( T1 ^ CB7 ); \ shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \ 0x28292a2b2c2d2e2f, 0x2021222324252627, \ 0x18191a1b1c1d1e1f, 0x1011121314151617, \ @@ -435,14 +431,10 @@ void blake512_8way_compress( blake_8way_big_context *sc ) V9 = m512_const1_64( CB1 ); VA = m512_const1_64( CB2 ); VB = m512_const1_64( CB3 ); - VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), - m512_const1_64( CB4 ) ); - VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ), - m512_const1_64( CB5 ) ); - VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ), - m512_const1_64( CB6 ) ); - VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ), - m512_const1_64( CB7 ) ); + VC = _mm512_set1_epi64( sc->T0 ^ CB4 ); + VD = _mm512_set1_epi64( sc->T0 ^ CB5 ); + VE = _mm512_set1_epi64( sc->T1 ^ CB6 ); + VF = _mm512_set1_epi64( sc->T1 ^ CB7 ); shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, 0x28292a2b2c2d2e2f, 0x2021222324252627, @@ -493,6 +485,241 @@ void blake512_8way_compress( blake_8way_big_context *sc ) sc->H[7] = mm512_xor3( VF, V7, sc->H[7] ); } +// won't be used after prehash implemented +void blake512_8way_compress_le( blake_8way_big_context *sc ) +{ + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + __m512i V0, V1, V2, V3, V4, V5, V6, V7; + __m512i V8, V9, VA, VB, VC, VD, VE, VF; + + V0 = sc->H[0]; + V1 = sc->H[1]; + V2 = sc->H[2]; + V3 = sc->H[3]; + V4 = sc->H[4]; + V5 = sc->H[5]; + V6 = sc->H[6]; + V7 = sc->H[7]; + V8 = m512_const1_64( CB0 ); + V9 = m512_const1_64( CB1 ); + VA = m512_const1_64( CB2 ); + VB = m512_const1_64( CB3 ); + VC = _mm512_set1_epi64( sc->T0 ^ CB4 ); + VD = _mm512_set1_epi64( sc->T0 ^ CB5 ); + VE = _mm512_set1_epi64( sc->T1 ^ CB6 ); + VF = _mm512_set1_epi64( sc->T1 ^ CB7 ); + + M0 = sc->buf[ 0]; + M1 = sc->buf[ 1]; + M2 = sc->buf[ 2]; + M3 = sc->buf[ 3]; + M4 = sc->buf[ 4]; + M5 = sc->buf[ 5]; + M6 = sc->buf[ 6]; + M7 = sc->buf[ 7]; + M8 = sc->buf[ 8]; + M9 = sc->buf[ 9]; + MA = sc->buf[10]; + MB = sc->buf[11]; + MC = sc->buf[12]; + MD = sc->buf[13]; + ME = sc->buf[14]; + MF = sc->buf[15]; + + ROUND_B_8WAY(0); + ROUND_B_8WAY(1); + ROUND_B_8WAY(2); + ROUND_B_8WAY(3); + ROUND_B_8WAY(4); + ROUND_B_8WAY(5); + ROUND_B_8WAY(6); + ROUND_B_8WAY(7); + ROUND_B_8WAY(8); + ROUND_B_8WAY(9); + ROUND_B_8WAY(0); + ROUND_B_8WAY(1); + ROUND_B_8WAY(2); + ROUND_B_8WAY(3); + ROUND_B_8WAY(4); + ROUND_B_8WAY(5); + + sc->H[0] = mm512_xor3( V8, V0, sc->H[0] ); + sc->H[1] = mm512_xor3( V9, V1, sc->H[1] ); + sc->H[2] = mm512_xor3( VA, V2, sc->H[2] ); + sc->H[3] = mm512_xor3( VB, V3, sc->H[3] ); + sc->H[4] = mm512_xor3( VC, V4, sc->H[4] ); + sc->H[5] = mm512_xor3( VD, V5, sc->H[5] ); + sc->H[6] = mm512_xor3( VE, V6, 
sc->H[6] ); + sc->H[7] = mm512_xor3( VF, V7, sc->H[7] ); +} + +// with final_le forms a full hash in 2 parts from little endian data. +// all variables hard coded for 80 bytes/lane. +void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate, + const void *data ) +{ + __m512i V0, V1, V2, V3, V4, V5, V6, V7; + __m512i V8, V9, VA, VB, VC, VD, VE, VF; + + // initial hash + casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); + casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B ); + casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B ); + casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 ); + casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 ); + casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F ); + casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); + casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); + + // fill buffer + memcpy_512( sc->buf, (__m512i*)data, 80>>3 ); + sc->buf[10] = m512_const1_64( 0x8000000000000000ULL ); + sc->buf[11] = + sc->buf[12] = m512_zero; + sc->buf[13] = m512_one_64; + sc->buf[14] = m512_zero; + sc->buf[15] = m512_const1_64( 80*8 ); + + // build working variables + V0 = sc->H[0]; + V1 = sc->H[1]; + V2 = sc->H[2]; + V3 = sc->H[3]; + V4 = sc->H[4]; + V5 = sc->H[5]; + V6 = sc->H[6]; + V7 = sc->H[7]; + V8 = m512_const1_64( CB0 ); + V9 = m512_const1_64( CB1 ); + VA = m512_const1_64( CB2 ); + VB = m512_const1_64( CB3 ); + VC = _mm512_set1_epi64( CB4 ^ 0x280ULL ); + VD = _mm512_set1_epi64( CB5 ^ 0x280ULL ); + VE = _mm512_set1_epi64( CB6 ); + VF = _mm512_set1_epi64( CB7 ); + + // skip the nonce + GB_8WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC ); + GB_8WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD ); + GB_8WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE ); + GB_8WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF ); + + // Do half of G4 + // GB_8WAY( sc->buf[ 8], sc->buf[ 9], CBx(0, 8), CBx(0, 9), V0, V5, VA, VF ); + + V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( + _mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); + VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 ); + VA = _mm512_add_epi64( VA, VF ); + V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 ); + V0 = _mm512_add_epi64( V0, V5 ); + + GB_8WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC ); + GB_8WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, V7, V8, VD ); + GB_8WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE ); + + // save midstate for second part + midstate[ 0] = V0; + midstate[ 1] = V1; + midstate[ 2] = V2; + midstate[ 3] = V3; + midstate[ 4] = V4; + midstate[ 5] = V5; + midstate[ 6] = V6; + midstate[ 7] = V7; + midstate[ 8] = V8; + midstate[ 9] = V9; + midstate[10] = VA; + midstate[11] = VB; + midstate[12] = VC; + midstate[13] = VD; + midstate[14] = VE; + midstate[15] = VF; +} + +// pick up where we left off, need the nonce now. 
+void blake512_8way_final_le( blake_8way_big_context *sc, void *hash, + const __m512i nonce, const __m512i *midstate ) +{ + __m512i M0, M1, M2, M3, M4, M5, M6, M7; + __m512i M8, M9, MA, MB, MC, MD, ME, MF; + __m512i V0, V1, V2, V3, V4, V5, V6, V7; + __m512i V8, V9, VA, VB, VC, VD, VE, VF; + __m512i h[8] __attribute__ ((aligned (64))); + + // Load data with new nonce + M0 = sc->buf[ 0]; + M1 = sc->buf[ 1]; + M2 = sc->buf[ 2]; + M3 = sc->buf[ 3]; + M4 = sc->buf[ 4]; + M5 = sc->buf[ 5]; + M6 = sc->buf[ 6]; + M7 = sc->buf[ 7]; + M8 = sc->buf[ 8]; + M9 = nonce; + MA = sc->buf[10]; + MB = sc->buf[11]; + MC = sc->buf[12]; + MD = sc->buf[13]; + ME = sc->buf[14]; + MF = sc->buf[15]; + + V0 = midstate[ 0]; + V1 = midstate[ 1]; + V2 = midstate[ 2]; + V3 = midstate[ 3]; + V4 = midstate[ 4]; + V5 = midstate[ 5]; + V6 = midstate[ 6]; + V7 = midstate[ 7]; + V8 = midstate[ 8]; + V9 = midstate[ 9]; + VA = midstate[10]; + VB = midstate[11]; + VC = midstate[12]; + VD = midstate[13]; + VE = midstate[14]; + VF = midstate[15]; + + // finish round 0 with the nonce now available + V0 = _mm512_add_epi64( V0, _mm512_xor_si512( + _mm512_set1_epi64( CB8 ), M9 ) ); + VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 16 ); + VA = _mm512_add_epi64( VA, VF ); + V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 11 ); + + // remaining rounds + ROUND_B_8WAY(1); + ROUND_B_8WAY(2); + ROUND_B_8WAY(3); + ROUND_B_8WAY(4); + ROUND_B_8WAY(5); + ROUND_B_8WAY(6); + ROUND_B_8WAY(7); + ROUND_B_8WAY(8); + ROUND_B_8WAY(9); + ROUND_B_8WAY(0); + ROUND_B_8WAY(1); + ROUND_B_8WAY(2); + ROUND_B_8WAY(3); + ROUND_B_8WAY(4); + ROUND_B_8WAY(5); + + h[0] = mm512_xor3( V8, V0, sc->H[0] ); + h[1] = mm512_xor3( V9, V1, sc->H[1] ); + h[2] = mm512_xor3( VA, V2, sc->H[2] ); + h[3] = mm512_xor3( VB, V3, sc->H[3] ); + h[4] = mm512_xor3( VC, V4, sc->H[4] ); + h[5] = mm512_xor3( VD, V5, sc->H[5] ); + h[6] = mm512_xor3( VE, V6, sc->H[6] ); + h[7] = mm512_xor3( VF, V7, sc->H[7] ); + + // bswap final hash + mm512_block_bswap_64( (__m512i*)hash, h ); +} + void blake512_8way_init( blake_8way_big_context *sc ) { casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); @@ -678,6 +905,73 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst, mm512_block_bswap_64( (__m512i*)dst, sc->H ); } +void blake512_8way_full_le( blake_8way_big_context *sc, void * dst, + const void *data, size_t len ) +{ + +// init + + casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 ); + casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B ); + casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B ); + casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 ); + casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 ); + casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F ); + casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B ); + casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 ); + + sc->T0 = sc->T1 = 0; + sc->ptr = 0; + +// update + + memcpy_512( sc->buf, (__m512i*)data, len>>3 ); + sc->ptr = len; + if ( len == 128 ) + { + if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) + sc->T1 = sc->T1 + 1; + blake512_8way_compress_le( sc ); + sc->ptr = 0; + } + +// close + + size_t ptr64 = sc->ptr >> 3; + unsigned bit_len; + uint64_t th, tl; + + bit_len = sc->ptr << 3; + sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL ); + tl = sc->T0 + bit_len; + th = sc->T1; + + if ( ptr64 == 0 ) + { + sc->T0 = 0xFFFFFFFFFFFFFC00ULL; + sc->T1 = 0xFFFFFFFFFFFFFFFFULL; + } + else if ( sc->T0 == 0 ) + { + sc->T0 = 0xFFFFFFFFFFFFFC00ULL + 
bit_len; + sc->T1 = sc->T1 - 1; + } + else + sc->T0 -= 1024 - bit_len; + + memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 ); + sc->buf[13] = m512_one_64; + sc->buf[14] = m512_const1_64( th ); + sc->buf[15] = m512_const1_64( tl ); + + if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) + sc->T1 = sc->T1 + 1; + + blake512_8way_compress_le( sc ); + + mm512_block_bswap_64( (__m512i*)dst, sc->H ); +} + void blake512_8way_update(void *cc, const void *data, size_t len) { @@ -741,14 +1035,10 @@ blake512_8way_close(void *cc, void *dst) V9 = m256_const1_64( CB1 ); \ VA = m256_const1_64( CB2 ); \ VB = m256_const1_64( CB3 ); \ - VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ - m256_const1_64( CB4 ) ); \ - VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ - m256_const1_64( CB5 ) ); \ - VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ - m256_const1_64( CB6 ) ); \ - VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ - m256_const1_64( CB7 ) ); \ + VC = _mm256_set1_epi64x( T0 ^ CB4 ); \ + VD = _mm256_set1_epi64x( T0 ^ CB5 ); \ + VE = _mm256_set1_epi64x( T1 ^ CB6 ); \ + VF = _mm256_set1_epi64x( T1 ^ CB7 ); \ shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \ 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ @@ -869,6 +1159,166 @@ void blake512_4way_compress( blake_4way_big_context *sc ) sc->H[7] = mm256_xor3( VF, V7, sc->H[7] ); } +void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate, + const void *data ) +{ + __m256i V0, V1, V2, V3, V4, V5, V6, V7; + __m256i V8, V9, VA, VB, VC, VD, VE, VF; + + // initial hash + casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); + casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B ); + casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B ); + casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 ); + casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 ); + casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F ); + casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); + casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); + + // fill buffer + memcpy_256( sc->buf, (__m256i*)data, 80>>3 ); + sc->buf[10] = m256_const1_64( 0x8000000000000000ULL ); + sc->buf[11] = m256_zero; + sc->buf[12] = m256_zero; + sc->buf[13] = m256_one_64; + sc->buf[14] = m256_zero; + sc->buf[15] = m256_const1_64( 80*8 ); + + // build working variables + V0 = sc->H[0]; + V1 = sc->H[1]; + V2 = sc->H[2]; + V3 = sc->H[3]; + V4 = sc->H[4]; + V5 = sc->H[5]; + V6 = sc->H[6]; + V7 = sc->H[7]; + V8 = m256_const1_64( CB0 ); + V9 = m256_const1_64( CB1 ); + VA = m256_const1_64( CB2 ); + VB = m256_const1_64( CB3 ); + VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL ); + VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL ); + VE = _mm256_set1_epi64x( CB6 ); + VF = _mm256_set1_epi64x( CB7 ); + + GB_4WAY( sc->buf[ 0], sc->buf[ 1], CB0, CB1, V0, V4, V8, VC ); + GB_4WAY( sc->buf[ 2], sc->buf[ 3], CB2, CB3, V1, V5, V9, VD ); + GB_4WAY( sc->buf[ 4], sc->buf[ 5], CB4, CB5, V2, V6, VA, VE ); + GB_4WAY( sc->buf[ 6], sc->buf[ 7], CB6, CB7, V3, V7, VB, VF ); + + // skip nonce + V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( + _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 ); + VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 ); + VA = _mm256_add_epi64( VA, VF ); + V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 ); + V0 = _mm256_add_epi64( V0, V5 ); + + GB_4WAY( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC ); + GB_4WAY( sc->buf[12], sc->buf[13], CBC, CBD, V2, 
V7, V8, VD ); + GB_4WAY( sc->buf[14], sc->buf[15], CBE, CBF, V3, V4, V9, VE ); + + // save midstate for second part + midstate[ 0] = V0; + midstate[ 1] = V1; + midstate[ 2] = V2; + midstate[ 3] = V3; + midstate[ 4] = V4; + midstate[ 5] = V5; + midstate[ 6] = V6; + midstate[ 7] = V7; + midstate[ 8] = V8; + midstate[ 9] = V9; + midstate[10] = VA; + midstate[11] = VB; + midstate[12] = VC; + midstate[13] = VD; + midstate[14] = VE; + midstate[15] = VF; +} + +void blake512_4way_final_le( blake_4way_big_context *sc, void *hash, + const __m256i nonce, const __m256i *midstate ) +{ + __m256i M0, M1, M2, M3, M4, M5, M6, M7; + __m256i M8, M9, MA, MB, MC, MD, ME, MF; + __m256i V0, V1, V2, V3, V4, V5, V6, V7; + __m256i V8, V9, VA, VB, VC, VD, VE, VF; + __m256i h[8] __attribute__ ((aligned (64))); + + // Load data with new nonce + M0 = sc->buf[ 0]; + M1 = sc->buf[ 1]; + M2 = sc->buf[ 2]; + M3 = sc->buf[ 3]; + M4 = sc->buf[ 4]; + M5 = sc->buf[ 5]; + M6 = sc->buf[ 6]; + M7 = sc->buf[ 7]; + M8 = sc->buf[ 8]; + M9 = nonce; + MA = sc->buf[10]; + MB = sc->buf[11]; + MC = sc->buf[12]; + MD = sc->buf[13]; + ME = sc->buf[14]; + MF = sc->buf[15]; + + V0 = midstate[ 0]; + V1 = midstate[ 1]; + V2 = midstate[ 2]; + V3 = midstate[ 3]; + V4 = midstate[ 4]; + V5 = midstate[ 5]; + V6 = midstate[ 6]; + V7 = midstate[ 7]; + V8 = midstate[ 8]; + V9 = midstate[ 9]; + VA = midstate[10]; + VB = midstate[11]; + VC = midstate[12]; + VD = midstate[13]; + VE = midstate[14]; + VF = midstate[15]; + + // finish round 0, with the nonce now available + V0 = _mm256_add_epi64( V0, _mm256_xor_si256( + _mm256_set1_epi64x( CB8 ), M9 ) ); + VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 ); + VA = _mm256_add_epi64( VA, VF ); + V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 ); + + ROUND_B_4WAY(1); + ROUND_B_4WAY(2); + ROUND_B_4WAY(3); + ROUND_B_4WAY(4); + ROUND_B_4WAY(5); + ROUND_B_4WAY(6); + ROUND_B_4WAY(7); + ROUND_B_4WAY(8); + ROUND_B_4WAY(9); + ROUND_B_4WAY(0); + ROUND_B_4WAY(1); + ROUND_B_4WAY(2); + ROUND_B_4WAY(3); + ROUND_B_4WAY(4); + ROUND_B_4WAY(5); + + h[0] = mm256_xor3( V8, V0, sc->H[0] ); + h[1] = mm256_xor3( V9, V1, sc->H[1] ); + h[2] = mm256_xor3( VA, V2, sc->H[2] ); + h[3] = mm256_xor3( VB, V3, sc->H[3] ); + h[4] = mm256_xor3( VC, V4, sc->H[4] ); + h[5] = mm256_xor3( VD, V5, sc->H[5] ); + h[6] = mm256_xor3( VE, V6, sc->H[6] ); + h[7] = mm256_xor3( VF, V7, sc->H[7] ); + + // bswap final hash + mm256_block_bswap_64( (__m256i*)hash, h ); +} + + void blake512_4way_init( blake_4way_big_context *sc ) { casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); diff --git a/algo/cubehash/cube-hash-2way.c b/algo/cubehash/cube-hash-2way.c index 06f7e095..0710a576 100644 --- a/algo/cubehash/cube-hash-2way.c +++ b/algo/cubehash/cube-hash-2way.c @@ -54,14 +54,12 @@ static void transform_4way( cube_4way_context *sp ) x5 = _mm512_add_epi32( x1, x5 ); x6 = _mm512_add_epi32( x2, x6 ); x7 = _mm512_add_epi32( x3, x7 ); - y0 = x0; - y1 = x1; - x0 = mm512_rol_32( x2, 7 ); - x1 = mm512_rol_32( x3, 7 ); - x2 = mm512_rol_32( y0, 7 ); - x3 = mm512_rol_32( y1, 7 ); - x0 = _mm512_xor_si512( x0, x4 ); - x1 = _mm512_xor_si512( x1, x5 ); + y0 = mm512_rol_32( x2, 7 ); + y1 = mm512_rol_32( x3, 7 ); + x2 = mm512_rol_32( x0, 7 ); + x3 = mm512_rol_32( x1, 7 ); + x0 = _mm512_xor_si512( y0, x4 ); + x1 = _mm512_xor_si512( y1, x5 ); x2 = _mm512_xor_si512( x2, x6 ); x3 = _mm512_xor_si512( x3, x7 ); x4 = mm512_swap128_64( x4 ); @@ -72,15 +70,13 @@ static void transform_4way( cube_4way_context *sp ) x5 = _mm512_add_epi32( x1, x5 ); x6 = _mm512_add_epi32( x2, 
x6 ); x7 = _mm512_add_epi32( x3, x7 ); - y0 = x0; - y1 = x2; - x0 = mm512_rol_32( x1, 11 ); - x1 = mm512_rol_32( y0, 11 ); - x2 = mm512_rol_32( x3, 11 ); - x3 = mm512_rol_32( y1, 11 ); - x0 = _mm512_xor_si512( x0, x4 ); + y0 = mm512_rol_32( x1, 11 ); + x1 = mm512_rol_32( x0, 11 ); + y1 = mm512_rol_32( x3, 11 ); + x3 = mm512_rol_32( x2, 11 ); + x0 = _mm512_xor_si512( y0, x4 ); x1 = _mm512_xor_si512( x1, x5 ); - x2 = _mm512_xor_si512( x2, x6 ); + x2 = _mm512_xor_si512( y1, x6 ); x3 = _mm512_xor_si512( x3, x7 ); x4 = mm512_swap64_32( x4 ); x5 = mm512_swap64_32( x5 ); @@ -131,83 +127,67 @@ static void transform_4way_2buf( cube_4way_2buf_context *sp ) { x4 = _mm512_add_epi32( x0, x4 ); y4 = _mm512_add_epi32( y0, y4 ); - tx0 = x0; - ty0 = y0; x5 = _mm512_add_epi32( x1, x5 ); y5 = _mm512_add_epi32( y1, y5 ); - tx1 = x1; - ty1 = y1; - x0 = mm512_rol_32( x2, 7 ); - y0 = mm512_rol_32( y2, 7 ); + tx0 = mm512_rol_32( x2, 7 ); + ty0 = mm512_rol_32( y2, 7 ); + tx1 = mm512_rol_32( x3, 7 ); + ty1 = mm512_rol_32( y3, 7 ); x6 = _mm512_add_epi32( x2, x6 ); - y6 = _mm512_add_epi32( y2, y6 ); - x1 = mm512_rol_32( x3, 7 ); - y1 = mm512_rol_32( y3, 7 ); + y6 = _mm512_add_epi32( y2, y6 ); x7 = _mm512_add_epi32( x3, x7 ); y7 = _mm512_add_epi32( y3, y7 ); - - - x2 = mm512_rol_32( tx0, 7 ); - y2 = mm512_rol_32( ty0, 7 ); - x0 = _mm512_xor_si512( x0, x4 ); - y0 = _mm512_xor_si512( y0, y4 ); + x2 = mm512_rol_32( x0, 7 ); + y2 = mm512_rol_32( y0, 7 ); + x3 = mm512_rol_32( x1, 7 ); + y3 = mm512_rol_32( y1, 7 ); + x0 = _mm512_xor_si512( tx0, x4 ); + y0 = _mm512_xor_si512( ty0, y4 ); + x1 = _mm512_xor_si512( tx1, x5 ); + y1 = _mm512_xor_si512( ty1, y5 ); x4 = mm512_swap128_64( x4 ); - x3 = mm512_rol_32( tx1, 7 ); - y3 = mm512_rol_32( ty1, 7 ); y4 = mm512_swap128_64( y4 ); - - x1 = _mm512_xor_si512( x1, x5 ); - y1 = _mm512_xor_si512( y1, y5 ); x5 = mm512_swap128_64( x5 ); + y5 = mm512_swap128_64( y5 ); x2 = _mm512_xor_si512( x2, x6 ); y2 = _mm512_xor_si512( y2, y6 ); - y5 = mm512_swap128_64( y5 ); x3 = _mm512_xor_si512( x3, x7 ); y3 = _mm512_xor_si512( y3, y7 ); - x6 = mm512_swap128_64( x6 ); + y6 = mm512_swap128_64( y6 ); + x7 = mm512_swap128_64( x7 ); + y7 = mm512_swap128_64( y7 ); x4 = _mm512_add_epi32( x0, x4 ); y4 = _mm512_add_epi32( y0, y4 ); - y6 = mm512_swap128_64( y6 ); x5 = _mm512_add_epi32( x1, x5 ); y5 = _mm512_add_epi32( y1, y5 ); - x7 = mm512_swap128_64( x7 ); + tx0 = mm512_rol_32( x1, 11 ); + ty0 = mm512_rol_32( y1, 11 ); + tx1 = mm512_rol_32( x3, 11 ); + ty1 = mm512_rol_32( y3, 11 ); x6 = _mm512_add_epi32( x2, x6 ); y6 = _mm512_add_epi32( y2, y6 ); - tx0 = x0; - ty0 = y0; - y7 = mm512_swap128_64( y7 ); - tx1 = x2; - ty1 = y2; - x0 = mm512_rol_32( x1, 11 ); - y0 = mm512_rol_32( y1, 11 ); - x7 = _mm512_add_epi32( x3, x7 ); y7 = _mm512_add_epi32( y3, y7 ); - - x1 = mm512_rol_32( tx0, 11 ); - y1 = mm512_rol_32( ty0, 11 ); - x0 = _mm512_xor_si512( x0, x4 ); + x1 = mm512_rol_32( x0, 11 ); + y1 = mm512_rol_32( y0, 11 ); + x3 = mm512_rol_32( x2, 11 ); + y3 = mm512_rol_32( y2, 11 ); + x0 = _mm512_xor_si512( tx0, x4 ); + y0 = _mm512_xor_si512( ty0, y4 ); + x1 = _mm512_xor_si512( x1, x5 ); + y1 = _mm512_xor_si512( y1, y5 ); x4 = mm512_swap64_32( x4 ); - y0 = _mm512_xor_si512( y0, y4 ); - x2 = mm512_rol_32( x3, 11 ); y4 = mm512_swap64_32( y4 ); - y2 = mm512_rol_32( y3, 11 ); - x1 = _mm512_xor_si512( x1, x5 ); x5 = mm512_swap64_32( x5 ); - y1 = _mm512_xor_si512( y1, y5 ); - x3 = mm512_rol_32( tx1, 11 ); y5 = mm512_swap64_32( y5 ); - y3 = mm512_rol_32( ty1, 11 ); - - x2 = _mm512_xor_si512( x2, x6 ); + x2 = 
_mm512_xor_si512( tx1, x6 ); + y2 = _mm512_xor_si512( ty1, y6 ); + x3 = _mm512_xor_si512( x3, x7 ); + y3 = _mm512_xor_si512( y3, y7 ); x6 = mm512_swap64_32( x6 ); - y2 = _mm512_xor_si512( y2, y6 ); y6 = mm512_swap64_32( y6 ); - x3 = _mm512_xor_si512( x3, x7 ); x7 = mm512_swap64_32( x7 ); - y3 = _mm512_xor_si512( y3, y7 ); - y7 = mm512_swap64_32( y7 ); } @@ -241,14 +221,6 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds, sp->rounds = rounds; sp->pos = 0; - h[ 0] = m512_const1_128( iv[0] ); - h[ 1] = m512_const1_128( iv[1] ); - h[ 2] = m512_const1_128( iv[2] ); - h[ 3] = m512_const1_128( iv[3] ); - h[ 4] = m512_const1_128( iv[4] ); - h[ 5] = m512_const1_128( iv[5] ); - h[ 6] = m512_const1_128( iv[6] ); - h[ 7] = m512_const1_128( iv[7] ); h[ 0] = m512_const1_128( iv[0] ); h[ 1] = m512_const1_128( iv[1] ); h[ 2] = m512_const1_128( iv[2] ); @@ -489,33 +461,29 @@ static void transform_2way( cube_2way_context *sp ) x5 = _mm256_add_epi32( x1, x5 ); x6 = _mm256_add_epi32( x2, x6 ); x7 = _mm256_add_epi32( x3, x7 ); - y0 = x0; - y1 = x1; - ROL2( x0, x1, x2, x3, 7 ); - ROL2( x2, x3, y0, y1, 7 ); - x0 = _mm256_xor_si256( x0, x4 ); - x4 = mm256_swap128_64( x4 ); - x1 = _mm256_xor_si256( x1, x5 ); + ROL2( y0, y1, x2, x3, 7 ); + ROL2( x2, x3, x0, x1, 7 ); + x0 = _mm256_xor_si256( y0, x4 ); + x1 = _mm256_xor_si256( y1, x5 ); x2 = _mm256_xor_si256( x2, x6 ); - x5 = mm256_swap128_64( x5 ); x3 = _mm256_xor_si256( x3, x7 ); - x4 = _mm256_add_epi32( x0, x4 ); + x4 = mm256_swap128_64( x4 ); + x5 = mm256_swap128_64( x5 ); x6 = mm256_swap128_64( x6 ); - y0 = x0; - x5 = _mm256_add_epi32( x1, x5 ); x7 = mm256_swap128_64( x7 ); + x4 = _mm256_add_epi32( x0, x4 ); + x5 = _mm256_add_epi32( x1, x5 ); x6 = _mm256_add_epi32( x2, x6 ); - y1 = x2; - ROL2( x0, x1, x1, y0, 11 ); x7 = _mm256_add_epi32( x3, x7 ); - ROL2( x2, x3, x3, y1, 11 ); - x0 = _mm256_xor_si256( x0, x4 ); - x4 = mm256_swap64_32( x4 ); + ROL2( y0, x1, x1, x0, 11 ); + ROL2( y1, x3, x3, x2, 11 ); + x0 = _mm256_xor_si256( y0, x4 ); x1 = _mm256_xor_si256( x1, x5 ); + x2 = _mm256_xor_si256( y1, x6 ); + x3 = _mm256_xor_si256( x3, x7 ); + x4 = mm256_swap64_32( x4 ); x5 = mm256_swap64_32( x5 ); - x2 = _mm256_xor_si256( x2, x6 ); x6 = mm256_swap64_32( x6 ); - x3 = _mm256_xor_si256( x3, x7 ); x7 = mm256_swap64_32( x7 ); } @@ -540,14 +508,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds, sp->rounds = rounds; sp->pos = 0; - h[ 0] = m256_const1_128( iv[0] ); - h[ 1] = m256_const1_128( iv[1] ); - h[ 2] = m256_const1_128( iv[2] ); - h[ 3] = m256_const1_128( iv[3] ); - h[ 4] = m256_const1_128( iv[4] ); - h[ 5] = m256_const1_128( iv[5] ); - h[ 6] = m256_const1_128( iv[6] ); - h[ 7] = m256_const1_128( iv[7] ); h[ 0] = m256_const1_128( iv[0] ); h[ 1] = m256_const1_128( iv[1] ); h[ 2] = m256_const1_128( iv[2] ); @@ -560,7 +520,6 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds, return 0; } - int cube_2way_update( cube_2way_context *sp, const void *data, size_t size ) { const int len = size >> 4; diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index f16047e9..90e13e71 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -26,6 +26,7 @@ typedef struct { } allium_16way_ctx_holder; static __thread allium_16way_ctx_holder allium_16way_ctx; +static __thread __m512i blake256_16way_midstate[16]; bool init_allium_16way_ctx() { @@ -58,8 +59,9 @@ void allium_16way_hash( void *state, const void *input ) allium_16way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, 
&allium_16way_ctx, sizeof(allium_16way_ctx) ); - blake256_16way_update( &ctx.blake, input + (64<<4), 16 ); - blake256_16way_close( &ctx.blake, vhash ); + ctx.blake.buf[3] = casti_m512i( input, 19 ); // grab nonce from input + blake256_16way_final_rounds_le( vhash, blake256_16way_midstate, ctx.blake.H, + ctx.blake.buf ); dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, @@ -198,6 +200,7 @@ void allium_16way_hash( void *state, const void *input ) groestl256_full( &ctx.groestl, state+416, hash13, 256 ); groestl256_full( &ctx.groestl, state+448, hash14, 256 ); groestl256_full( &ctx.groestl, state+480, hash15, 256 ); + #endif } @@ -214,15 +217,29 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 19; // aligned const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m512i sixteen = m512_const1_32( 16 ); if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm512_bswap32_intrlv80_16x32( vdata, pdata ); + for ( int i = 0; i < 19; i++ ) + casti_m512i( vdata, i ) = _mm512_set1_epi32( pdata[i] ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); + // Prehash first block blake256_16way_init( &allium_16way_ctx.blake ); - blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 ); + blake256_16way_update_le( &allium_16way_ctx.blake, vdata, 64 ); + + // Prehash second block, fill buf with last 16 bytes and add padding. + memcpy_512( allium_16way_ctx.blake.buf, (__m512i*)vdata + 16, 4 ); + allium_16way_ctx.blake.buf[ 4] = m512_const1_32( 0x80000000 ); + memset_zero_512( allium_16way_ctx.blake.buf + 5, 8 ); + allium_16way_ctx.blake.buf[13] = m512_one_32; + allium_16way_ctx.blake.buf[14] = m512_zero; + allium_16way_ctx.blake.buf[15] = m512_const1_32( 80*8 ); + + blake256_16way_round0_prehash_le( blake256_16way_midstate, + allium_16way_ctx.blake.H, allium_16way_ctx.blake.buf ); do { allium_16way_hash( hash, vdata ); @@ -230,10 +247,10 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, for ( int lane = 0; lane < 16; lane++ ) if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) ) { - pdata[19] = bswap_32( n + lane ); - submit_solution( work, hash+(lane<<3), mythr ); + pdata[19] = n + lane; + submit_solution( work, hash+(lane<<3), mythr ); } - *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); + *noncev = _mm512_add_epi32( *noncev, sixteen ); n += 16; } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) ); pdata[19] = n; @@ -256,6 +273,7 @@ typedef struct { } allium_8way_ctx_holder; static __thread allium_8way_ctx_holder allium_8way_ctx; +static __thread __m256i blake256_8way_midstate[16]; bool init_allium_8way_ctx() { @@ -279,8 +297,9 @@ void allium_8way_hash( void *hash, const void *input ) allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) ); - blake256_8way_update( &ctx.blake, input + (64<<3), 16 ); - blake256_8way_close( &ctx.blake, vhashA ); + ctx.blake.buf[3] = casti_m256i( input, 19 ); // grab nonce from input + blake256_8way_final_rounds_le( vhashA, blake256_8way_midstate, ctx.blake.H, + ctx.blake.buf ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhashA, 256 ); @@ -386,11 +405,24 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const bool bench = opt_benchmark; - 
mm256_bswap32_intrlv80_8x32( vdata, pdata ); + for ( int i = 0; i < 19; i++ ) + casti_m256i( vdata, i ) = _mm256_set1_epi32( pdata[i] ); *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); + // Prehash first block blake256_8way_init( &allium_8way_ctx.blake ); - blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 ); + blake256_8way_update_le( &allium_8way_ctx.blake, vdata, 64 ); + + // Prehash second block, fill buf with last 16 bytes and add padding. + memcpy_256( allium_8way_ctx.blake.buf, (__m256i*)vdata + 16, 4 ); + allium_8way_ctx.blake.buf[ 4] = m256_const1_32( 0x80000000 ); + memset_zero_256( allium_8way_ctx.blake.buf + 5, 8 ); + allium_8way_ctx.blake.buf[13] = m256_one_32; + allium_8way_ctx.blake.buf[14] = m256_zero; + allium_8way_ctx.blake.buf[15] = m256_const1_32( 80*8 ); + + blake256_8way_round0_prehash_le( blake256_8way_midstate, + allium_8way_ctx.blake.H, allium_8way_ctx.blake.buf ); do { allium_8way_hash( hash, vdata ); @@ -400,7 +432,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, const uint64_t *lane_hash = hash + (lane<<2); if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - pdata[19] = bswap_32( n + lane ); + pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); } } diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 531ce5d5..79b5731c 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -14,12 +14,25 @@ bool lyra2z_16way_thread_init() return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) ); } -static __thread blake256_16way_context l2z_16way_blake_mid; +static __thread blake256_16way_context l2z_16way_blake_ctx; +static __thread __m512i blake256_16way_midstate[16]; void lyra2z_16way_midstate( const void* input ) { - blake256_16way_init( &l2z_16way_blake_mid ); - blake256_16way_update( &l2z_16way_blake_mid, input, 64 ); + // First block + blake256_16way_init( &l2z_16way_blake_ctx ); + blake256_16way_update_le( &l2z_16way_blake_ctx, input, 64 ); + + // Second block + memcpy_512( l2z_16way_blake_ctx.buf, (__m512i*)input + 16, 4 ); + l2z_16way_blake_ctx.buf[ 4] = m512_const1_32( 0x80000000 ); + memset_zero_512( l2z_16way_blake_ctx.buf + 5, 8 ); + l2z_16way_blake_ctx.buf[13] = m512_one_32; + l2z_16way_blake_ctx.buf[14] = m512_zero; + l2z_16way_blake_ctx.buf[15] = m512_const1_32( 80*8 ); + + blake256_16way_round0_prehash_le( blake256_16way_midstate, + l2z_16way_blake_ctx.H, l2z_16way_blake_ctx.buf ); } void lyra2z_16way_hash( void *state, const void *input ) @@ -43,9 +56,11 @@ void lyra2z_16way_hash( void *state, const void *input ) uint32_t hash15[8] __attribute__ ((aligned (64))); blake256_16way_context ctx_blake __attribute__ ((aligned (64))); - memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid ); - blake256_16way_update( &ctx_blake, input + (64*16), 16 ); - blake256_16way_close( &ctx_blake, vhash ); + memcpy( &ctx_blake, &l2z_16way_blake_ctx, sizeof l2z_16way_blake_ctx ); + + ctx_blake.buf[3] = casti_m512i( input, 19 ); // grab nonce from input + blake256_16way_final_rounds_le( vhash, blake256_16way_midstate, ctx_blake.H, + ctx_blake.buf ); dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, @@ -107,10 +122,12 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 19; // aligned const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m512i sixteen = m512_const1_32( 16 ); if ( bench ) 
ptarget[7] = 0x0000ff; - mm512_bswap32_intrlv80_16x32( vdata, pdata ); + for ( int i = 0; i < 19; i++ ) + casti_m512i( vdata, i ) = _mm512_set1_epi32( pdata[i] ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); lyra2z_16way_midstate( vdata ); @@ -123,11 +140,11 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, const uint64_t *lane_hash = hash + (lane<<2); if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - pdata[19] = bswap_32( n + lane ); + pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); + *noncev = _mm512_add_epi32( *noncev, sixteen ); n += 16; } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); @@ -145,12 +162,23 @@ bool lyra2z_8way_thread_init() return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) ); } -static __thread blake256_8way_context l2z_8way_blake_mid; +static __thread blake256_8way_context l2z_8way_blake_ctx; +static __thread __m256i blake256_8way_midstate[16]; void lyra2z_8way_midstate( const void* input ) { - blake256_8way_init( &l2z_8way_blake_mid ); - blake256_8way_update( &l2z_8way_blake_mid, input, 64 ); + blake256_8way_init( &l2z_8way_blake_ctx ); + blake256_8way_update_le( &l2z_8way_blake_ctx, input, 64 ); + + memcpy_256( l2z_8way_blake_ctx.buf, (__m256i*)input + 16, 4 ); + l2z_8way_blake_ctx.buf[ 4] = m256_const1_32( 0x80000000 ); + memset_zero_256( l2z_8way_blake_ctx.buf + 5, 8 ); + l2z_8way_blake_ctx.buf[13] = m256_one_32; + l2z_8way_blake_ctx.buf[14] = m256_zero; + l2z_8way_blake_ctx.buf[15] = m256_const1_32( 80*8 ); + + blake256_8way_round0_prehash_le( blake256_8way_midstate, + l2z_8way_blake_ctx.H, l2z_8way_blake_ctx.buf ); } void lyra2z_8way_hash( void *state, const void *input ) @@ -166,9 +194,11 @@ void lyra2z_8way_hash( void *state, const void *input ) uint32_t vhash[8*8] __attribute__ ((aligned (64))); blake256_8way_context ctx_blake __attribute__ ((aligned (64))); - memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid ); - blake256_8way_update( &ctx_blake, input + (64*8), 16 ); - blake256_8way_close( &ctx_blake, vhash ); + memcpy( &ctx_blake, &l2z_8way_blake_ctx, sizeof l2z_8way_blake_ctx ); + + ctx_blake.buf[3] = casti_m256i( input, 19 ); // grab nonce from input + blake256_8way_final_rounds_le( vhash, blake256_8way_midstate, ctx_blake.H, + ctx_blake.buf ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); @@ -206,10 +236,12 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 19; // aligned const int thr_id = mythr->id; const bool bench = opt_benchmark; + const __m256i eight = m256_const1_32( 8 ); if ( bench ) ptarget[7] = 0x0000ff; - mm256_bswap32_intrlv80_8x32( vdata, pdata ); + for ( int i = 0; i < 19; i++ ) + casti_m256i( vdata, i ) = _mm256_set1_epi32( pdata[i] ); *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); lyra2z_8way_midstate( vdata ); @@ -221,11 +253,11 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, const uint64_t *lane_hash = hash + (lane<<2); if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - pdata[19] = bswap_32( n + lane ); + pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); + *noncev = _mm256_add_epi32( *noncev, eight ); n += 8; } while ( likely( (n < last_nonce) && 
!work_restart[thr_id].restart) ); pdata[19] = n; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 1902a2de..fb3ff5bb 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -58,6 +58,9 @@ union _x17_8way_context_overlay } __attribute__ ((aligned (64))); typedef union _x17_8way_context_overlay x17_8way_context_overlay; +static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64))); +static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64))); + int x17_8way_hash( void *state, const void *input, int thr_id ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); @@ -73,8 +76,9 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) uint64_t hash7[8] __attribute__ ((aligned (64))); x17_8way_context_overlay ctx; - blake512_8way_full( &ctx.blake, vhash, input, 80 ); - + blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ), + x17_8way_midstate ); + bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -122,9 +126,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) cube_4way_2buf_full( &ctx.cube, vhashA, vhashB, 512, vhashA, vhashB, 64 ); -// cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 ); -// cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 ); - #if defined(__VAES__) shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 ); @@ -237,6 +238,61 @@ int x17_8way_hash( void *state, const void *input, int thr_id ) return 1; } +int scanhash_x17_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash32[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + __m128i edata[5] __attribute__ ((aligned (64))); + uint32_t *hash32_d7 = &(hash32[7*8]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const uint32_t targ32_d7 = ptarget[7]; + const __m512i eight = m512_const1_64( 8 ); + const bool bench = opt_benchmark; + + edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); + edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) ); + edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) ); + edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) ); + edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) ); + + mm512_intrlv80_8x64( vdata, edata ); + + *noncev = mm512_intrlv_blend_32( *noncev, + _mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4, + 0, n+3, 0, n+2, 0, n+1, 0, n ) ); + blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata ); + + do + { + if ( likely( x17_8way_hash( hash32, vdata, thr_id ) ) ) + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( ( hash32_d7[ lane ] <= targ32_d7 ) && !bench ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, eight ); + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + + + #elif defined(X17_4WAY) union _x17_4way_context_overlay diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index eee3d60d..be31125b 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -3,7 +3,7 @@ bool 
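
The 64-bit blake512 prehash works the same way, but each 64-bit lane first has its 32-bit halves swapped, which is what the mm128_swap64_32 calls on edata do before mm512_intrlv80_8x64 broadcasts the result. A scalar sketch of that prep, under the assumption that mm128_swap64_32 swaps the two 32-bit halves of every 64-bit element (the helper name below is illustrative only): after the swap the nonce, header word 19, lands in the low half of 64-bit word 9, the word scanhash hands to blake512_8way_final_le as casti_m512i( input, 9 ).

#include <stdint.h>

// Scalar view of the swap64_32 header prep: each pair of 32 bit header
// words is packed with the odd word in the low half of the 64 bit result.
static void swap64_32_header( uint64_t out[10], const uint32_t in[20] )
{
   for ( int i = 0; i < 10; i++ )
      out[i] = ( (uint64_t)in[ 2*i ] << 32 ) | in[ 2*i + 1 ];   // out[9] low half = nonce
}
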
register_x17_algo( algo_gate_t* gate ) { #if defined (X17_8WAY) - gate->scanhash = (void*)&scanhash_8way_64in_32out; + gate->scanhash = (void*)&scanhash_x17_8way; gate->hash = (void*)&x17_8way_hash; #elif defined (X17_4WAY) gate->scanhash = (void*)&scanhash_4way_64in_32out; diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 003d77fe..f88a4d60 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -14,10 +14,15 @@ bool register_x17_algo( algo_gate_t* gate ); #if defined(X17_8WAY) +int scanhash_x17_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + int x17_8way_hash( void *state, const void *input, int thr_id ); #elif defined(X17_4WAY) +int scanhash_x17_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); int x17_4way_hash( void *state, const void *input, int thr_id ); #endif diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 5acf3de5..a67eb145 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -21,7 +21,6 @@ #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" -#include "algo/swifftx/swifftx.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" #include "algo/shavite/shavite-hash-4way.h" diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index 826f0f88..b8e087f1 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -50,6 +50,7 @@ bool register_x25x_algo( algo_gate_t* gate ) #endif gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT | VAES_OPT; + InitializeSWIFFTX(); return true; }; diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h index 0acedc70..4dc1bf24 100644 --- a/algo/x22/x22i-gate.h +++ b/algo/x22/x22i-gate.h @@ -5,6 +5,7 @@ #include "simd-utils.h" #include #include +#include "algo/swifftx/swifftx.h" #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define X22I_8WAY 1 diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index ff2888ec..bb7b3060 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -24,7 +24,6 @@ #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" -#include "algo/swifftx/swifftx.h" #include "algo/panama/panama-hash-4way.h" #include "algo/lanehash/lane.h" #if defined(__VAES__) @@ -102,6 +101,9 @@ union _x25x_8way_ctx_overlay }; typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay; +static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64))); +static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64))); + int x25x_8way_hash( void *output, const void *input, int thrid ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); @@ -118,9 +120,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) uint64_t vhashB[8*8] __attribute__ ((aligned (64))); x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64))); - blake512_8way_init( &ctx.blake ); - blake512_8way_update( &ctx.blake, input, 80 ); - blake512_8way_close( &ctx.blake, vhash ); + blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ), + x25x_8way_midstate ); + dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0], hash4[0], hash5[0], hash6[0], hash7[0], vhash ); @@ -271,7 +273,6 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10], hash4[10], hash5[10], hash6[10], hash7[10] ); - #else init_echo( 
&ctx.echo, 512 ); @@ -558,6 +559,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce, { uint32_t hash[8*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); + __m128i edata[5] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hashd7 = &(hash[7*8]); uint32_t *pdata = work->data; @@ -569,15 +571,22 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const uint32_t targ32 = ptarget[7]; const bool bench = opt_benchmark; - + const __m512i eight = m512_const1_64( 8 ); if ( bench ) ptarget[7] = 0x08ff; - InitializeSWIFFTX(); + edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); + edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) ); + edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) ); + edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) ); + edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) ); + + mm512_intrlv80_8x64( vdata, edata ); + + *noncev = mm512_intrlv_blend_32( *noncev, + _mm512_set_epi32( 0, n+7, 0, n+6, 0, n+5, 0, n+4, + 0, n+3, 0, n+2, 0, n+1, 0, n ) ); + blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata ); - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - *noncev = mm512_intrlv_blend_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { if ( x25x_8way_hash( hash, vdata, thr_id ) ); @@ -588,12 +597,11 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce, extr_lane_8x32( lane_hash, hash, lane, 256 ); if ( likely( valid_hash( lane_hash, ptarget ) ) ) { - pdata[19] = bswap_32( n + lane ); + pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm512_add_epi32( *noncev, - m512_const1_64( 0x0000000800000000 ) ); + *noncev = _mm512_add_epi32( *noncev, eight ); n += 8; } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); pdata[19] = n; @@ -637,8 +645,12 @@ union _x25x_4way_ctx_overlay panama_4way_context panama; blake2s_4way_state blake2s; }; + typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; +static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64))); +static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64))); + int x25x_4way_hash( void *output, const void *input, int thrid ) { uint64_t vhash[8*4] __attribute__ ((aligned (128))); @@ -651,7 +663,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) uint64_t vhashB[8*4] __attribute__ ((aligned (64))); x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64))); - blake512_4way_full( &ctx.blake, vhash, input, 80 ); + blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ), + x25x_4way_midstate ); + dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash ); bmw512_4way_init( &ctx.bmw ); @@ -905,6 +919,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce, uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (64))); + __m128i edata[5] __attribute__ ((aligned (64))); uint32_t *hashd7 = &(hash[ 7*4 ]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -914,15 +929,23 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const uint32_t targ32 = ptarget[7]; + const __m256i four = m256_const1_64( 4 ); const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x08ff; - 
InitializeSWIFFTX(); + edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); + edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) ); + edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) ); + edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) ); + edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) ); - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - *noncev = mm256_intrlv_blend_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + mm256_intrlv80_4x64( vdata, edata ); + + *noncev = mm256_intrlv_blend_32( *noncev, + _mm256_set_epi32( 0, n+3, 0, n+2, 0, n+1, 0, n ) ); + blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata ); + do { if ( x25x_4way_hash( hash, vdata, thr_id ) ) @@ -932,12 +955,11 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce, extr_lane_4x32( lane_hash, hash, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) { - pdata[19] = bswap_32( n + lane ); + pdata[19] = n + lane; submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm256_add_epi32( *noncev, - m256_const1_64( 0x0000000400000000 ) ); + *noncev = _mm256_add_epi32( *noncev, four ); n += 4; } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) ); pdata[19] = n; diff --git a/configure b/configure index 0f520e17..1e6054aa 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.7. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.8. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.19.7' -PACKAGE_STRING='cpuminer-opt 3.19.7' +PACKAGE_VERSION='3.19.8' +PACKAGE_STRING='cpuminer-opt 3.19.8' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.19.7 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.19.8 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.19.7:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.19.8:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.19.7 +cpuminer-opt configure 3.19.8 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.19.7, which was +It was created by cpuminer-opt $as_me 3.19.8, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.19.7' + VERSION='3.19.8' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" -This file was extended by cpuminer-opt $as_me 3.19.7, which was +This file was extended by cpuminer-opt $as_me 3.19.8, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.19.7 +cpuminer-opt config.status 3.19.8 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 11ee7cab..30234994 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.19.7]) +AC_INIT([cpuminer-opt], [3.19.8]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index d8af92d5..8fc6c7a0 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1099,7 +1099,7 @@ void report_summary_log( bool force ) sprintf_et( et_str, et.tv_sec ); sprintf_et( upt_str, uptime.tv_sec ); - applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], short_url ); + applog( LOG_BLUE, "%s: %s", algo_names[ opt_algo ], rpc_url ); applog2( LOG_NOTICE, "Periodic Report %s %s", et_str, upt_str ); applog2( LOG_INFO, "Share rate %.2f/min %.2f/min", submit_rate, safe_div( (double)submitted_share_count*60., @@ -2754,7 +2754,7 @@ static void *stratum_thread(void *userdata ) stratum.url = (char*) tq_pop(mythr->q, NULL); if (!stratum.url) goto out; - applog( LOG_BLUE, "Stratum connect %s", short_url ); + applog( LOG_BLUE, "Stratum connect %s", stratum.url ); while (1) { @@ -3335,6 +3335,7 @@ void parse_arg(int key, char *arg ) if ( strncasecmp( arg, "http://", 7 ) && strncasecmp( arg, "https://", 8 ) && strncasecmp( arg, "stratum+tcp://", 14 ) + && strncasecmp( arg, "stratum+ssl://", 14 ) && strncasecmp( arg, "stratum+tcps://", 15 ) ) { fprintf(stderr, "unknown protocol -- '%s'\n", arg); @@ -3768,6 +3769,7 @@ int main(int argc, char *argv[]) flags = CURL_GLOBAL_ALL; if ( !opt_benchmark ) if ( strncasecmp( rpc_url, "https:", 6 ) + && strncasecmp( rpc_url, "stratum+ssl://", 14 ) && strncasecmp( rpc_url, "stratum+tcps://", 15 ) ) flags &= ~CURL_GLOBAL_SSL; diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 00fb1516..372744ed 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -508,6 +508,32 @@ static inline void mm128_bswap32_80( void *d, void *s ) #endif +static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) +{ + uint32_t *s = (uint32_t*)src; + casti_m128i( d, 0 ) = _mm_set1_epi32( bswap_32( s[ 0] ) ); + casti_m128i( d, 1 ) = _mm_set1_epi32( bswap_32( s[ 1] ) ); + casti_m128i( d, 2 ) = _mm_set1_epi32( bswap_32( s[ 2] ) ); + casti_m128i( d, 3 ) = _mm_set1_epi32( bswap_32( s[ 3] ) ); + casti_m128i( d, 4 ) = _mm_set1_epi32( bswap_32( s[ 4] ) ); + casti_m128i( d, 5 ) = _mm_set1_epi32( bswap_32( s[ 5] ) ); + casti_m128i( d, 6 ) = _mm_set1_epi32( bswap_32( s[ 6] ) ); + casti_m128i( d, 7 ) = _mm_set1_epi32( bswap_32( s[ 7] ) ); + casti_m128i( d, 8 ) = _mm_set1_epi32( bswap_32( s[ 8] ) ); + casti_m128i( d, 9 ) = _mm_set1_epi32( bswap_32( s[ 9] ) ); + casti_m128i( d,10 ) = _mm_set1_epi32( bswap_32( s[10] ) ); + casti_m128i( d,11 ) = _mm_set1_epi32( bswap_32( s[11] ) ); + casti_m128i( d,12 ) = _mm_set1_epi32( bswap_32( s[12] ) ); + casti_m128i( d,13 ) = _mm_set1_epi32( bswap_32( s[13] ) ); + casti_m128i( d,14 ) = _mm_set1_epi32( bswap_32( s[14] ) ); + casti_m128i( d,15 ) = _mm_set1_epi32( bswap_32( s[15] ) ); + casti_m128i( d,16 ) = _mm_set1_epi32( 
bswap_32( s[16] ) ); + casti_m128i( d,17 ) = _mm_set1_epi32( bswap_32( s[17] ) ); + casti_m128i( d,18 ) = _mm_set1_epi32( bswap_32( s[18] ) ); + casti_m128i( d,19 ) = _mm_set1_epi32( bswap_32( s[19] ) ); +} + +/* static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) { __m128i s0 = casti_m128i( src,0 ); @@ -561,6 +587,7 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) casti_m128i( d,18 ) = _mm_shuffle_epi32( s4, 0xaa ); casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff ); } +*/ // 8x32 /* @@ -1110,6 +1137,31 @@ static inline void extr_lane_8x32( void *d, const void *s, #if defined(__AVX2__) +static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) +{ + uint32_t *s = (uint32_t*)src; + casti_m256i( d, 0 ) = _mm256_set1_epi32( bswap_32( s[ 0] ) ); + casti_m256i( d, 1 ) = _mm256_set1_epi32( bswap_32( s[ 1] ) ); + casti_m256i( d, 2 ) = _mm256_set1_epi32( bswap_32( s[ 2] ) ); + casti_m256i( d, 3 ) = _mm256_set1_epi32( bswap_32( s[ 3] ) ); + casti_m256i( d, 4 ) = _mm256_set1_epi32( bswap_32( s[ 4] ) ); + casti_m256i( d, 5 ) = _mm256_set1_epi32( bswap_32( s[ 5] ) ); + casti_m256i( d, 6 ) = _mm256_set1_epi32( bswap_32( s[ 6] ) ); + casti_m256i( d, 7 ) = _mm256_set1_epi32( bswap_32( s[ 7] ) ); + casti_m256i( d, 8 ) = _mm256_set1_epi32( bswap_32( s[ 8] ) ); + casti_m256i( d, 9 ) = _mm256_set1_epi32( bswap_32( s[ 9] ) ); + casti_m256i( d,10 ) = _mm256_set1_epi32( bswap_32( s[10] ) ); + casti_m256i( d,11 ) = _mm256_set1_epi32( bswap_32( s[11] ) ); + casti_m256i( d,12 ) = _mm256_set1_epi32( bswap_32( s[12] ) ); + casti_m256i( d,13 ) = _mm256_set1_epi32( bswap_32( s[13] ) ); + casti_m256i( d,14 ) = _mm256_set1_epi32( bswap_32( s[14] ) ); + casti_m256i( d,15 ) = _mm256_set1_epi32( bswap_32( s[15] ) ); + casti_m256i( d,16 ) = _mm256_set1_epi32( bswap_32( s[16] ) ); + casti_m256i( d,17 ) = _mm256_set1_epi32( bswap_32( s[17] ) ); + casti_m256i( d,18 ) = _mm256_set1_epi32( bswap_32( s[18] ) ); + casti_m256i( d,19 ) = _mm256_set1_epi32( bswap_32( s[19] ) ); +} +/* static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) { __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); @@ -1170,6 +1222,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) casti_m128i( d,38 ) = casti_m128i( d,39 ) = _mm_shuffle_epi32( s4 , 0xff ); } +*/ #endif // AVX2 @@ -1718,6 +1771,31 @@ static inline void extr_lane_16x32( void *d, const void *s, #if defined(__AVX512F__) && defined(__AVX512VL__) +static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) +{ + uint32_t *s = (uint32_t*)src; + casti_m512i( d, 0 ) = _mm512_set1_epi32( bswap_32( s[ 0] ) ); + casti_m512i( d, 1 ) = _mm512_set1_epi32( bswap_32( s[ 1] ) ); + casti_m512i( d, 2 ) = _mm512_set1_epi32( bswap_32( s[ 2] ) ); + casti_m512i( d, 3 ) = _mm512_set1_epi32( bswap_32( s[ 3] ) ); + casti_m512i( d, 4 ) = _mm512_set1_epi32( bswap_32( s[ 4] ) ); + casti_m512i( d, 5 ) = _mm512_set1_epi32( bswap_32( s[ 5] ) ); + casti_m512i( d, 6 ) = _mm512_set1_epi32( bswap_32( s[ 6] ) ); + casti_m512i( d, 7 ) = _mm512_set1_epi32( bswap_32( s[ 7] ) ); + casti_m512i( d, 8 ) = _mm512_set1_epi32( bswap_32( s[ 8] ) ); + casti_m512i( d, 9 ) = _mm512_set1_epi32( bswap_32( s[ 9] ) ); + casti_m512i( d,10 ) = _mm512_set1_epi32( bswap_32( s[10] ) ); + casti_m512i( d,11 ) = _mm512_set1_epi32( bswap_32( s[11] ) ); + casti_m512i( d,12 ) = _mm512_set1_epi32( bswap_32( s[12] ) ); + casti_m512i( d,13 ) = _mm512_set1_epi32( bswap_32( s[13] ) ); + casti_m512i( d,14 
) = _mm512_set1_epi32( bswap_32( s[14] ) ); + casti_m512i( d,15 ) = _mm512_set1_epi32( bswap_32( s[15] ) ); + casti_m512i( d,16 ) = _mm512_set1_epi32( bswap_32( s[16] ) ); + casti_m512i( d,17 ) = _mm512_set1_epi32( bswap_32( s[17] ) ); + casti_m512i( d,18 ) = _mm512_set1_epi32( bswap_32( s[18] ) ); + casti_m512i( d,19 ) = _mm512_set1_epi32( bswap_32( s[19] ) ); +} +/* static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) { __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); @@ -1818,6 +1896,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) casti_m128i( d,78 ) = casti_m128i( d,79 ) = _mm_shuffle_epi32( s4 , 0xff ); } +*/ #endif // AVX512 @@ -2470,6 +2549,25 @@ static inline void extr_lane_8x64( void *d, const void *s, #if defined(__AVX512F__) && defined(__AVX512VL__) +// broadcast to all lanes +static inline void mm512_intrlv80_8x64( void *dst, const void *src ) +{ + __m128i *d = (__m128i*)dst; + const __m128i *s = (const __m128i*)src; + + d[ 0] = d[ 1] = d[ 2] = d[ 3] = _mm_shuffle_epi32( s[0], 0x44 ); + d[ 4] = d[ 5] = d[ 6] = d[ 7] = _mm_shuffle_epi32( s[0], 0xee ); + d[ 8] = d[ 9] = d[10] = d[11] = _mm_shuffle_epi32( s[1], 0x44 ); + d[12] = d[13] = d[14] = d[15] = _mm_shuffle_epi32( s[1], 0xee ); + d[16] = d[17] = d[18] = d[19] = _mm_shuffle_epi32( s[2], 0x44 ); + d[20] = d[21] = d[22] = d[23] = _mm_shuffle_epi32( s[2], 0xee ); + d[24] = d[25] = d[26] = d[27] = _mm_shuffle_epi32( s[3], 0x44 ); + d[28] = d[29] = d[30] = d[31] = _mm_shuffle_epi32( s[3], 0xee ); + d[32] = d[33] = d[34] = d[35] = _mm_shuffle_epi32( s[4], 0x44 ); + d[36] = d[37] = d[38] = d[39] = _mm_shuffle_epi32( s[4], 0xee ); +} + +// byte swap and broadcast to all lanes static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) { __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 6867a3d9..134ebcba 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -15,13 +15,14 @@ // AVX512 intrinsics have a few changes from previous conventions. // -// cmp instruction now returns a bitmask isnstead of a vector mask. +// cmp instruction now returns a bitmask instead of a vector mask. // This eliminates the need for the blendv instruction. // // The new rotate instructions require the count to be an 8 bit // immediate value only. Compilation fails if a variable is used. // The documentation is the same as for shift and it works with -// variables. +// variables. The inconsistency is likely due to compiler optimizations +// that can eliminate the variable in some instances. // // _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute // usually shuffles accross all lanes. diff --git a/util.c b/util.c index b746ef9a..b2004610 100644 --- a/util.c +++ b/util.c @@ -1542,11 +1542,20 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url) free(sctx->url); sctx->url = strdup(url); } - free(sctx->curl_url); + + free(sctx->curl_url); sctx->curl_url = (char*) malloc(strlen(url)); - sprintf( sctx->curl_url, "http%s", strstr( url, "s://" ) - ? strstr( url, "s://" ) - : strstr (url, "://" ) ); + + // replace the stratum protocol prefix with http, or https for ssl + sprintf( sctx->curl_url, "%s%s", + ( strstr( url, "s://" ) || strstr( url, "ssl://" ) ) + ? "https" : "http", strstr( url, "://" ) ); + + + +// sprintf( sctx->curl_url, "http%s", strstr( url, "s://" ) +// ?
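
For scanhash routines that still need big-endian data, the rewritten mm128/mm256/mm512_bswap32_intrlv80_* helpers above drop the shuffle-table approach and simply byte swap each of the 20 header words once before broadcasting it with set1. A scalar reference for the same store pattern (the helper and bswap32 below are illustrative stand-ins, not project code):

#include <stdint.h>

static inline uint32_t bswap32( uint32_t x )
{
   return (x >> 24) | ((x >> 8) & 0x0000ff00) |
          ((x << 8) & 0x00ff0000) | (x << 24);
}

// Each header word is byte swapped once, then written to every lane.
static void bswap32_intrlv80( uint32_t *d, const uint32_t *s, int lanes )
{
   for ( int i = 0; i < 20; i++ )
      for ( int lane = 0; lane < lanes; lane++ )
         d[ i*lanes + lane ] = bswap32( s[i] );
}

mm512_intrlv80_8x64 performs the same broadcast without the byte swap, for callers that have already converted the header with mm128_swap64_32.
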
strstr( url, "s://" ) +// : strstr (url, "://" ) ); if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);