From 7a1389998b4f89e4b1bf4f9ab70e4956c2467a05 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 14 Dec 2017 18:28:51 -0500 Subject: [PATCH] v3.7.6 --- Makefile.am | 5 +- README.md | 1 + RELEASE_NOTES | 6 + algo-gate-api.c | 17 +- algo/blake/blake-4way.c | 24 +- algo/blake/blake-gate.c | 1 - algo/blake/blake-hash-4way.c | 63 +-- algo/blake/decred-4way.c | 99 ++-- algo/blake/sph_blake.c | 1 + algo/jh/jha-4way.c | 53 ++- algo/lyra2/lyra2h.c | 93 ++++ algo/lyra2/lyra2z-4way.c | 3 +- algo/lyra2/sponge.h | 8 +- algo/shavite/sph-shavite-aesni.c | 673 +++++++++++++++++++++++++++ algo/shavite/sph_shavite.c | 8 +- algo/shavite/sph_shavite.h | 78 ++-- algo/veltor.c | 2 +- algo/whirlpool/md_helper.c | 4 +- algo/whirlpool/whirlpool-4way.c | 11 +- algo/whirlpool/whirlpool-gate.c | 11 +- algo/whirlpool/whirlpool-gate.h | 11 +- algo/whirlpool/whirlpool-hash-4way.c | 2 + algo/x11/x11-gate.c | 18 + algo/x11/x11-gate.h | 30 ++ algo/x11/x11.c | 8 +- algo/yescrypt/yescrypt.c | 2 +- avxdefs.h | 286 +++++++----- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 107 +++-- miner.h | 3 + 31 files changed, 1279 insertions(+), 371 deletions(-) create mode 100644 algo/lyra2/lyra2h.c create mode 100644 algo/shavite/sph-shavite-aesni.c create mode 100644 algo/x11/x11-gate.c create mode 100644 algo/x11/x11-gate.h diff --git a/Makefile.am b/Makefile.am index 3b6bdc86..ec741307 100644 --- a/Makefile.am +++ b/Makefile.am @@ -108,6 +108,7 @@ cpuminer_SOURCES = \ algo/lyra2/lyra2z.c \ algo/lyra2/lyra2z-4way.c \ algo/lyra2/lyra2z330.c \ + algo/lyra2/lyra2h.c \ algo/m7m.c \ algo/neoscrypt.c \ algo/nist5/nist5-gate.c \ @@ -128,6 +129,7 @@ cpuminer_SOURCES = \ algo/sha/sha256t.c \ algo/shabal/sph_shabal.c \ algo/shavite/sph_shavite.c \ + algo/shavite/sph-shavite-aesni.c \ algo/shavite/shavite.c \ algo/simd/sph_simd.c \ algo/simd/sse2/nist.c \ @@ -155,11 +157,12 @@ cpuminer_SOURCES = \ algo/whirlpool/whirlpool-4way.c \ algo/whirlpool/whirlpool.c \ algo/whirlpool/whirlpoolx.c \ - algo/x11/phi1612.c \ + algo/x11/x11-gate.c \ algo/x11/x11.c \ algo/x11/x11evo.c \ algo/x11/x11gost.c \ algo/x11/c11.c \ + algo/x11/phi1612.c \ algo/x13/x13.c \ algo/x13/x13sm3.c \ algo/x14/x14.c \ diff --git a/README.md b/README.md index 053d72a1..054e8235 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ Supported Algorithms keccakc Creative coin lbry LBC, LBRY Credits luffa Luffa + lyra2h Hppcoin lyra2re lyra2 lyra2rev2 lyra2v2, Vertcoin lyra2z Zcoin (XZC) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index ed91d398..16b3d506 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -164,6 +164,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble. Change Log ---------- +v3.7.6 + +Added kyra2h algo for Hppcoin. +Added support for more than 64 CPUs. +Optimized shavite512 with AES, improves x11 etc. 
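The shavite512 speedup noted above comes from the new sph-shavite-aesni.c further down in this patch, which replaces the table-driven AES round with the AESENC instruction. A minimal sketch of the primitive it builds on (assuming SSE2 and AES-NI; shavite_step is a hypothetical name, the real c512() below inlines this pattern directly):

#include <immintrin.h>   // AES-NI intrinsic _mm_aesenc_si128

// One SHAvite-3 step: XOR the round key into the state, then run a single
// AES round with an all-zero round key so only the ShiftRows, SubBytes and
// MixColumns transforms are applied -- the same pattern used throughout the
// new AES-NI c512() in this patch.
static inline __m128i shavite_step( __m128i x, __m128i rk )
{
    x = _mm_xor_si128( x, rk );
    return _mm_aesenc_si128( x, _mm_setzero_si128() );
}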
+ v3.7.5 New algo keccakc for Creative coin with 4way optimizations diff --git a/algo-gate-api.c b/algo-gate-api.c index 7409b006..d33ee08a 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -138,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate ) gate->work_cmp_size = STD_WORK_CMP_SIZE; } +// Ignore warnings for not yet defined register functions +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-function-declaration" + // called by each thread that uses the gate bool register_algo_gate( int algo, algo_gate_t *gate ) { @@ -151,11 +155,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) switch (algo) { - -// Ignore warnings for not yet defined register fucntions -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wimplicit-function-declaration" - case ALGO_ARGON2: register_argon2_algo ( gate ); break; case ALGO_AXIOM: register_axiom_algo ( gate ); break; case ALGO_BASTION: register_bastion_algo ( gate ); break; @@ -180,6 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_KECCAKC: register_keccakc_algo ( gate ); break; case ALGO_LBRY: register_lbry_algo ( gate ); break; case ALGO_LUFFA: register_luffa_algo ( gate ); break; + case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break; case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break; case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break; case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break; @@ -221,10 +221,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; case ALGO_ZR5: register_zr5_algo ( gate ); break; - -// restore warnings -#pragma GCC diagnostic pop - default: applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] ); return false; @@ -239,6 +235,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) return true; } +// restore warnings +#pragma GCC diagnostic pop + // override std defaults with jr2 defaults bool register_json_rpc2( algo_gate_t *gate ) { diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index 35eab389..70b51a3d 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -32,12 +32,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done ) { uint32_t vdata[20*4] __attribute__ ((aligned (64))); - uint32_t hash[4*4] __attribute__ ((aligned (32))); + uint32_t hash[8*4] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; // uint32_t HTarget = ptarget[7]; - uint32_t _ALIGN(32) endiandata[20]; + uint32_t _ALIGN(32) edata[20]; uint32_t n = first_nonce; uint32_t *nonces = work->nonces; bool *found = work->nfound; @@ -47,18 +47,17 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, // HTarget = 0x7f; // we need big endian data... 
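// Editor's note (illustration only, not part of the patch): after
// mm_interleave_4x32() the four lanes are interleaved one 32-bit word at a
// time, so word 19 (the nonce) of lanes 0..3 lands in four consecutive
// uint32_t slots starting at vdata + 19*4 = vdata + 76:
//
//              lane 0     lane 1     lane 2     lane 3
//   nonce:   vdata[76]  vdata[77]  vdata[78]  vdata[79]
//
// which is why the nonce stores below change from strides of two words
// (+2, +4, +6) to consecutive offsets (+1, +2, +3).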
- swab32_array( endiandata, pdata, 20 ); + swab32_array( edata, pdata, 20 ); - mm_interleave_4x32( vdata, endiandata, endiandata, endiandata, - endiandata, 640 ); + mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); uint32_t *noncep = vdata + 76; // 19*4 do { found[0] = found[1] = found[2] = found[3] = false; be32enc( noncep, n ); - be32enc( noncep +2, n+1 ); - be32enc( noncep +4, n+2 ); - be32enc( noncep +6, n+3 ); + be32enc( noncep +1, n+1 ); + be32enc( noncep +2, n+2 ); + be32enc( noncep +3, n+3 ); blakehash_4way( hash, vdata ); @@ -74,7 +73,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, } if ( (hash+8)[7] == 0 ) { - if ( fulltest( hash, ptarget ) ) + if ( fulltest( hash+8, ptarget ) ) { found[1] = true; num_found++; @@ -83,7 +82,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, } if ( (hash+16)[7] == 0 ) { - if ( fulltest( hash, ptarget ) ) + if ( fulltest( hash+8, ptarget ) ) { found[2] = true; num_found++; @@ -92,15 +91,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce, } if ( (hash+24)[7] == 0 ) { - if ( fulltest( hash, ptarget ) ) + if ( fulltest( hash+8, ptarget ) ) { found[3] = true; num_found++; nonces[3] = n+3; } } - - n += 4; + n += 4; *hashes_done = n - first_nonce + 1; } while ( (num_found == 0) && (n < max_nonce) diff --git a/algo/blake/blake-gate.c b/algo/blake/blake-gate.c index f6f66278..b050ff47 100644 --- a/algo/blake/blake-gate.c +++ b/algo/blake/blake-gate.c @@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate ) gate->optimizations = FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_blake_4way; gate->hash = (void*)&blakehash_4way; - four_way_not_tested(); #else gate->scanhash = (void*)&scanhash_blake; gate->hash = (void*)&blakehash; diff --git a/algo/blake/blake-hash-4way.c b/algo/blake/blake-hash-4way.c index 161a4142..b89952c8 100644 --- a/algo/blake/blake-hash-4way.c +++ b/algo/blake/blake-hash-4way.c @@ -524,18 +524,18 @@ do { \ V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \ - V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \ - VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \ - VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \ - VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \ - _mmset_epi32( CS4, CS4, CS4, CS4 ) ); \ - VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \ - _mmset_epi32( CS5, CS5, CS5, CS5 ) ); \ - VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \ - , _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \ - VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \ - _mmset_epi32( CS7, CS7, CS7, CS7 ) ); \ + V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \ + V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \ + VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \ + VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \ + VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \ + _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \ + VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \ + _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \ + VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \ + , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \ + VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ + _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \ M[0x0] = mm_byteswap_32( *(buf + 0) ); \ M[0x1] = mm_byteswap_32( *(buf + 1) ); \ M[0x2] = mm_byteswap_32( *(buf + 2) ); \ @@ -710,18 +710,18 @@ do { \ V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = 
_mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \ - V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \ - VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \ - VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \ - VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \ - _mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \ - VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \ - _mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \ - VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \ - _mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \ - VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \ - _mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \ + V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \ + V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \ + VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \ + VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \ + VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \ + _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \ + VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \ + _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \ + VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ + _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ + VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ + _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ M[0x0] = mm256_byteswap_64( *(buf+0) ); \ M[0x1] = mm256_byteswap_64( *(buf+1) ); \ M[0x2] = mm256_byteswap_64( *(buf+2) ); \ @@ -867,7 +867,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len ) buf = sc->buf; ptr = sc->ptr; - if ( len < buf_size - ptr ) { memcpy_128( buf + (ptr>>2), vdata, len>>2 ); @@ -915,9 +914,10 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, ptr = sc->ptr; bit_len = ((unsigned)ptr << 3); - unsigned z = 0x80 >> n; - unsigned zz = ((ub & -z) | z) & 0xFF; - u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz ); +// unsigned z = 0x80 >> n; +// unsigned zz = ((ub & -z) | z) & 0xFF; +// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz ); + u.buf[ptr>>2] = _mm_set1_epi32( 0x80 ); tl = sc->T0 + bit_len; th = sc->T1; @@ -934,9 +934,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, else sc->T0 -= 512 - bit_len; - if ( ptr <= 48 ) +// if ( ptr <= 48 ) + if ( ptr <= 52 ) { - memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 ); + memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 ); +// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 ); if (out_size_w32 == 8) u.buf[52>>2] = _mm_or_si128( u.buf[52>>2], _mm_set_epi32( 0x010000000, 0x01000000, @@ -962,6 +964,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, out = (__m128i*)dst; for ( k = 0; k < out_size_w32; k++ ) out[k] = mm_byteswap_32( sc->H[k] ); +// out[k] = sc->H[k]; } #if defined (__AVX2__) diff --git a/algo/blake/decred-4way.c b/algo/blake/decred-4way.c index 825edadf..de65b722 100644 --- a/algo/blake/decred-4way.c +++ b/algo/blake/decred-4way.c @@ -13,46 +13,35 @@ static __thread bool ctx_midstate_done = false; void decred_hash_4way( void *state, const void *input ) { - uint32_t vhash[4*4] __attribute__ ((aligned (64))); - uint32_t hash0[4] __attribute__ ((aligned (32))); - uint32_t hash1[4] __attribute__ ((aligned (32))); - uint32_t hash2[4] __attribute__ ((aligned (32))); - uint32_t hash3[4] __attribute__ ((aligned (32))); + uint32_t vhash[8*4] __attribute__ 
((aligned (64))); + uint32_t hash0[8] __attribute__ ((aligned (32))); + uint32_t hash1[8] __attribute__ ((aligned (32))); + uint32_t hash2[8] __attribute__ ((aligned (32))); + uint32_t hash3[8] __attribute__ ((aligned (32))); blake256_4way_context ctx __attribute__ ((aligned (64))); sph_blake256_context ctx2 __attribute__ ((aligned (64))); uint32_t hash[16] __attribute__ ((aligned (64))); uint32_t sin0[45], sin1[45], sin2[45], sin3[45]; + mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 ); - void *tail = input + DECRED_MIDSTATE_LEN; + void *tail = input + ( DECRED_MIDSTATE_LEN << 2 ); int tail_len = 180 - DECRED_MIDSTATE_LEN; -// #define MIDSTATE_LEN 128 -/* - uint8_t *ending = (uint8_t*) input; - ending += MIDSTATE_LEN; - - if ( !ctx_midstate_done ) - { - blake256_4way_init( &blake_mid ); - blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN ); - ctx_midstate_done = true; - } - memcpy( &ctx, &blake_mid, sizeof(blake_mid) ); + memcpy( &ctx, &blake_mid, sizeof(blake_mid) ); blake256_4way( &ctx, tail, tail_len ); blake256_4way_close( &ctx, vhash ); -*/ - - +/* sph_blake256_init( &ctx2 ); sph_blake256( &ctx2, sin0, 180 ); sph_blake256_close( &ctx2, hash ); - +*/ +/* blake256_4way_init( &ctx ); blake256_4way( &ctx, input, 180 ); blake256_4way_close( &ctx, vhash ); - +*/ mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); /* for ( int i = 0; i < 8; i++ ) @@ -66,22 +55,21 @@ printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1), printf("\n"); */ -// memcpy( state, hash0, 32 ); -// memcpy( state+32, hash1, 32 ); -// memcpy( state+64, hash1, 32 ); -// memcpy( state+96, hash1, 32 ); + memcpy( state, hash0, 32 ); + memcpy( state+32, hash1, 32 ); + memcpy( state+64, hash2, 32 ); + memcpy( state+96, hash3, 32 ); - memcpy( state, hash, 32 ); +// memcpy( state, hash, 32 ); } int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) { - uint32_t vdata[45*4] __attribute__ ((aligned (64))); - uint32_t hash[4*4] __attribute__ ((aligned (32))); - uint32_t _ALIGN(64) endiandata[48]; -// uint32_t _ALIGN(64) hash32[8]; + uint32_t vdata[48*4] __attribute__ ((aligned (64))); + uint32_t hash[8*4] __attribute__ ((aligned (32))); + uint32_t _ALIGN(64) edata[48]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX]; @@ -91,28 +79,25 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, bool *found = work->nfound; int num_found = 0; -// #define DCR_NONCE_OFT32 35 - ctx_midstate_done = false; - -// memcpy(endiandata, pdata, 180); + memcpy( edata, pdata, 180 ); // use the old way until new way updated for size. 
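// Editor's note (illustration only, not part of the patch): the first
// DECRED_MIDSTATE_LEN bytes of the 180-byte Decred header lie before the
// nonce (which is what makes midstate caching valid here), so the 4-way
// Blake-256 midstate is computed once per scan below into blake_mid, and
// decred_hash_4way() only re-hashes the remaining
// 180 - DECRED_MIDSTATE_LEN bytes per nonce after copying blake_mid into a
// fresh context. The ( DECRED_MIDSTATE_LEN << 2 ) tail offset accounts for
// the 4-way interleaving, which multiplies per-lane byte offsets by four.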
- mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 ); + mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 ); + + blake256_4way_init( &blake_mid ); + blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN ); uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4; do { found[0] = found[1] = found[2] = found[3] = false; * noncep = n; - *(noncep+2) = n+1; - *(noncep+4) = n+2; - *(noncep+6) = n+3; + *(noncep+1) = n+1; + *(noncep+2) = n+2; + *(noncep+3) = n+3; decred_hash_4way( hash, vdata ); -// endiandata[DCR_NONCE_OFT32] = n; -// decred_hash(hash32, endiandata); - if ( hash[7] <= HTarget && fulltest( hash, ptarget ) ) { work_set_target_ratio( work, hash ); @@ -121,29 +106,47 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce, nonces[0] = n; pdata[DECRED_NONCE_INDEX] = n; } -/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) +/* + if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) { +printf("found 1\n"); + +printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] ); +printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] ); +printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] ); +printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] ); + work_set_target_ratio( work, hash+8 ); found[1] = true; num_found++; - nonces[1] = n; + nonces[1] = n+1; } +*/ if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) ) { work_set_target_ratio( work, hash+16 ); found[2] = true; num_found++; - nonces[2] = n; + nonces[2] = n+2; } +/* if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) ) { +printf("found 3\n"); + +printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] ); +printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] ); +printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] ); +printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] ); + work_set_target_ratio( work, hash+24 ); found[3] = true; num_found++; - nonces[3] = n; + nonces[3] = n+3; } */ - n += 4; + n += 2; +// n += 4; } while ( (num_found == 0) && (n < max_nonce) && !work_restart[thr_id].restart ); diff --git a/algo/blake/sph_blake.c b/algo/blake/sph_blake.c index ad1680cf..8c7c3b91 100644 --- a/algo/blake/sph_blake.c +++ b/algo/blake/sph_blake.c @@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc, } else { sc->T0 -= 512 - bit_len; } + if (bit_len <= 446) { memset(u.buf + ptr + 1, 0, 55 - ptr); if (out_size_w32 == 8) diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index d735bd7c..6f4dea25 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -25,7 +25,7 @@ void jha_hash_4way( void *out, const void *input ) uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash0[8*4] __attribute__ ((aligned (64))); uint64_t vhash1[8*4] __attribute__ ((aligned (64))); - __m256i mask0, mask1; + __m256i mask, mask0, mask1; __m256i* vh = (__m256i*)vhash; __m256i* vh0 = (__m256i*)vhash0; __m256i* vh1 = (__m256i*)vhash1; @@ -47,38 +47,37 @@ void jha_hash_4way( void *out, const void *input ) // Heavy & Light Pair Loop for ( int round = 0; round < 3; round++ ) { -// memset_zero_256( vh0, 20 ); -// memset_zero_256( vh1, 20 ); - - // positive logic, if maski select vhi - // going from bit to mask reverses logic such that if the test bit is set - // zero will be put in mask0, meaning don't take vh0. 
mask1 is - // inverted so 1 will be put in mask1 meaning take it. - mask0 = mm256_negate_64( + // select next function based on bit 0 of previous hash. + // Specutively execute both functions and use mask to + // select results from correct function for each lane. + // hash = mask : vhash0 ? vhash1 + mask = mm256_negate_64( _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) ); - mask1 = mm256_not( mask0 ); +// second version +// mask0 = mask +// mask1 = mm256_not( mask ); + +// first version // mask = _mm256_sub_epi64( _mm256_and_si256( vh[0], // _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) ); - // groestl (serial) v skein + // groestl (serial) vs skein mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); init_groestl( &ctx_groestl, 64 ); update_and_final_groestl( &ctx_groestl, (char*)hash0, - (char*)hash0, 512 ); - + (char*)hash0, 512 ); init_groestl( &ctx_groestl, 64 ); update_and_final_groestl( &ctx_groestl, (char*)hash1, - (char*)hash1, 512 ); - + (char*)hash1, 512 ); init_groestl( &ctx_groestl, 64 ); update_and_final_groestl( &ctx_groestl, (char*)hash2, - (char*)hash2, 512 ); + (char*)hash2, 512 ); init_groestl( &ctx_groestl, 64 ); update_and_final_groestl( &ctx_groestl, (char*)hash3, - (char*)hash3, 512 ); + (char*)hash3, 512 ); mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 ); @@ -91,14 +90,20 @@ void jha_hash_4way( void *out, const void *input ) // merge vectored hash for ( int i = 0; i < 8; i++ ) { - vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ), - _mm256_and_si256( vh1[i], mask1 ) ); + // blend should be faster + vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask ); + +// second version +// vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ), +// _mm256_and_si256( vh1[i], mask1 ) ); + +// first version /* - vha256[i] = _mm256_maskload_epi64( - vhasha + i*4, mm256_not( mask ) ); - vhb256[i] = _mm256_maskload_epi64( - vhashb + i*4, mask ); - vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] ); + vh0[i] = _mm256_maskload_epi64( + vhash0 + i*4, mm256_not( mask ) ); + vh1[i] = _mm256_maskload_epi64( + vhash1 + i*4, mask ); + vh[i] = _mm256_or_si256( vh0[i], vh1[i] ); */ } diff --git a/algo/lyra2/lyra2h.c b/algo/lyra2/lyra2h.c new file mode 100644 index 00000000..85b10a62 --- /dev/null +++ b/algo/lyra2/lyra2h.c @@ -0,0 +1,93 @@ +#include +#include +#include "algo-gate-api.h" +#include "lyra2.h" +#include "algo/blake/sph_blake.h" + +__thread uint64_t* lyra2h_matrix; + +bool lyra2h_thread_init() +{ + const int i = 16 * 16 * 96; + lyra2h_matrix = _mm_malloc( i, 64 ); + return lyra2h_matrix; +} + +static __thread sph_blake256_context lyra2h_blake_mid; + +void lyra2h_midstate( const void* input ) +{ + sph_blake256_init( &lyra2h_blake_mid ); + sph_blake256( &lyra2h_blake_mid, input, 64 ); +} + +void lyra2h_hash( void *state, const void *input ) +{ + uint32_t _ALIGN(64) hash[16]; + + sph_blake256_context ctx_blake __attribute__ ((aligned (64))); + + memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid ); + sph_blake256( &ctx_blake, input + 64, 16 ); + sph_blake256_close( &ctx_blake, hash ); + + LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8); + + memcpy(state, hash, 32); +} + +int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t _ALIGN(64) hash[8]; + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t nonce = 
first_nonce; + + if (opt_benchmark) + ptarget[7] = 0x0000ff; + + for (int i=0; i < 19; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + lyra2h_midstate( endiandata ); + + do { + be32enc(&endiandata[19], nonce); + lyra2h_hash( hash, endiandata ); + + if (hash[7] <= Htarg && fulltest(hash, ptarget)) { + work_set_target_ratio(work, hash); + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + return 1; + } + nonce++; + + } while (nonce < max_nonce && !work_restart[thr_id].restart); + + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} + +void lyra2h_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + +bool register_lyra2h_algo( algo_gate_t* gate ) +{ + gate->optimizations = AVX_OPT | AVX2_OPT; + gate->miner_thread_init = (void*)&lyra2h_thread_init; + gate->scanhash = (void*)&scanhash_lyra2h; + gate->hash = (void*)&lyra2h_hash; + gate->get_max64 = (void*)&get_max64_0xffffLL; + gate->set_target = (void*)&lyra2h_set_target; + return true; +}; + diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 7ee4b374..97ea4365 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -110,7 +110,8 @@ printf("found 0\n"); nonces[0] = pdata[19] = n; work_set_target_ratio( work, hash ); } -/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) +/* + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) { printf("found 1\n"); found[1] = true; diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 46e93a6c..947a1061 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_rotl256_1x64( s1); \ + s1 = mm256_rotr256_1x64( s1); \ s2 = mm256_swap_128( s2 ); \ - s3 = mm256_rotr256_1x64( s3 ); \ + s3 = mm256_rotl256_1x64( s3 ); \ G_4X64( s0, s1, s2, s3 ); \ - s1 = mm256_rotr256_1x64( s1 ); \ + s1 = mm256_rotl256_1x64( s1 ); \ s2 = mm256_swap_128( s2 ); \ - s3 = mm256_rotl256_1x64( s3 ); + s3 = mm256_rotr256_1x64( s3 ); #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c new file mode 100644 index 00000000..35380ca2 --- /dev/null +++ b/algo/shavite/sph-shavite-aesni.c @@ -0,0 +1,673 @@ +/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHAvite-3 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ +#include +#include +#include + +#ifdef __AES__ + +#include "sph_shavite.h" +#include "avxdefs.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE +#define SPH_SMALL_FOOTPRINT_SHAVITE 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define C32 SPH_C32 + +/* + * As of round 2 of the SHA-3 competition, the published reference + * implementation and test vectors are wrong, because they use + * big-endian AES tables while the internal decoding uses little-endian. + * The code below follows the specification. To turn it into a code + * which follows the reference implementation (the one called "BugFix" + * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out + * the code below (from the '#define AES_BIG_ENDIAN...' to the definition + * of the AES_ROUND_NOKEY macro) and replace it with the version which + * is commented out afterwards. + */ + +#define AES_BIG_ENDIAN 0 +#include "algo/sha/aes_helper.c" + +static const sph_u32 IV512[] = { + C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), + C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), + C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), + C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + + +#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \ + sph_u32 kt; \ + AES_ROUND_NOKEY(k1, k2, k3, k0); \ + kt = (k0); \ + (k0) = (k1); \ + (k1) = (k2); \ + (k2) = (k3); \ + (k3) = kt; \ + } while (0) + + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
+ */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 rk[448]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 128); +#else + for (u = 0; u < 32; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 32; + for (;;) { + for (s = 0; s < 4; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 32) { + rk[ 32] ^= sc->count0; + rk[ 33] ^= sc->count1; + rk[ 34] ^= sc->count2; + rk[ 35] ^= SPH_T32(~sc->count3); + } else if (u == 440) { + rk[440] ^= sc->count1; + rk[441] ^= sc->count0; + rk[442] ^= sc->count3; + rk[443] ^= SPH_T32(~sc->count2); + } + u += 4; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 164) { + rk[164] ^= sc->count3; + rk[165] ^= sc->count2; + rk[166] ^= sc->count1; + rk[167] ^= SPH_T32(~sc->count0); + } else if (u == 316) { + rk[316] ^= sc->count2; + rk[317] ^= sc->count3; + rk[318] ^= sc->count0; + rk[319] ^= SPH_T32(~sc->count1); + } + u += 4; + } + if (u == 448) + break; + for (s = 0; s < 8; s ++) { + rk[u + 0] = rk[u - 32] ^ rk[u - 7]; + rk[u + 1] = rk[u - 31] ^ rk[u - 6]; + rk[u + 2] = rk[u - 30] ^ rk[u - 5]; + rk[u + 3] = rk[u - 29] ^ rk[u - 4]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + u = 0; + for (r = 0; r < 14; r ++) { +#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \ + sph_u32 x0, x1, x2, x3; \ + x0 = r0 ^ rk[u ++]; \ + x1 = r1 ^ rk[u ++]; \ + x2 = r2 ^ rk[u ++]; \ + x3 = r3 ^ rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + l0 ^= x0; \ + l1 ^= x1; \ + l2 ^= x2; \ + l3 ^= x3; \ + } while (0) + +#define WROT(a, b, c, d) do { \ + sph_u32 t = d; \ + d = c; \ + c = b; \ + b = a; \ + a = t; \ + } while (0) + + C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7); + C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF); + + WROT(p0, p4, p8, pC); + WROT(p1, p5, p9, pD); + WROT(p2, p6, pA, pE); + WROT(p3, p7, pB, pF); + +#undef C512_ELT +#undef WROT + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; + sc->h[0x8] ^= p8; + sc->h[0x9] ^= p9; + sc->h[0xA] ^= pA; + sc->h[0xB] ^= pB; + sc->h[0xC] ^= pC; + sc->h[0xD] ^= pD; + 
sc->h[0xE] ^= pE; + sc->h[0xF] ^= pF; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512( sph_shavite_big_context *sc, const void *msg ) +{ + __m128i p0, p1, p2, p3, x; + __m128i k00, k01, k02, k03, k10, k11, k12, k13; + __m128i *m = (__m128i*)msg; + __m128i *h = (__m128i*)sc->h; + int r; + + p0 = h[0]; + p1 = h[1]; + p2 = h[2]; + p3 = h[3]; + + // round + k00 = m[0]; + x = _mm_xor_si128( p1, k00 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k01 = m[1]; + x = _mm_xor_si128( x, k01 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k02 = m[2]; + x = _mm_xor_si128( x, k02 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k03 = m[3]; + x = _mm_xor_si128( x, k03 ); + x = _mm_aesenc_si128( x, mm_zero ); + p0 = _mm_xor_si128( p0, x ); + + k10 = m[4]; + x = _mm_xor_si128( p3, k10 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k11 = m[5]; + x = _mm_xor_si128( x, k11 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k12 = m[6]; + x = _mm_xor_si128( x, k12 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k13 = m[7]; + x = _mm_xor_si128( x, k13 ); + x = _mm_aesenc_si128( x, mm_zero ); + p2 = _mm_xor_si128( p2, x ); + + for ( r = 0; r < 3; r ++ ) + { + // round 1, 5, 9 + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = _mm_xor_si128( k00, k13 ); + + if ( r == 0 ) + k00 = _mm_xor_si128( k00, _mm_set_epi32( + ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); + + x = _mm_xor_si128( p0, k00 ); + x = _mm_aesenc_si128( x, mm_zero ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + k01 = _mm_xor_si128( k01, k00 ); + + if ( r == 1 ) + k01 = _mm_xor_si128( k01, _mm_set_epi32( + ~sc->count0, sc->count1, sc->count2, sc->count3 ) ); + + x = _mm_xor_si128( x, k01 ); + x = _mm_aesenc_si128( x, mm_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + k02 = _mm_xor_si128( k02, k01 ); + + x = _mm_xor_si128( x, k02 ); + x = _mm_aesenc_si128( x, mm_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + k03 = _mm_xor_si128( k03, k02 ); + + x = _mm_xor_si128( x, k03 ); + x = _mm_aesenc_si128( x, mm_zero ); + p3 = _mm_xor_si128( p3, x ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = _mm_xor_si128( k10, k03 ); + + x = _mm_xor_si128( p2, k10 ); + x = _mm_aesenc_si128( x, mm_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + k11 = _mm_xor_si128( k11, k10 ); + + x = _mm_xor_si128( x, k11 ); + x = _mm_aesenc_si128( x, mm_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + k12 = _mm_xor_si128( k12, k11 ); + + x = _mm_xor_si128( x, k12 ); + x = _mm_aesenc_si128( x, mm_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + k13 = _mm_xor_si128( k13, k12 ); + + if ( r == 2 ) + k13 = _mm_xor_si128( k13, _mm_set_epi32( + ~sc->count1, sc->count0, sc->count3, sc->count2 ) ); + + x = _mm_xor_si128( x, k13 ); + x = _mm_aesenc_si128( x, mm_zero ); + p1 = _mm_xor_si128( p1, x ); + + // round 2, 6, 10 + + k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) ); + x = _mm_xor_si128( p3, k00 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) ); + x = _mm_xor_si128( x, k01 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) ); + x = _mm_xor_si128( x, k02 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) ); + x = _mm_xor_si128( x, k03 ); + x = _mm_aesenc_si128( x, mm_zero ); + + p2 = _mm_xor_si128( p2, x ); + k10 = _mm_xor_si128( k10, 
mm_rotr256_32( k02, k03, 1 ) ); + x = _mm_xor_si128( p1, k10 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) ); + x = _mm_xor_si128( x, k11 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) ); + x = _mm_xor_si128( x, k12 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) ); + x = _mm_xor_si128( x, k13 ); + x = _mm_aesenc_si128( x, mm_zero ); + p0 = _mm_xor_si128( p0, x ); + + // round 3, 7, 11 + + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = _mm_xor_si128( k00, k13 ); + + x = _mm_xor_si128( p2, k00 ); + x = _mm_aesenc_si128( x, mm_zero ); + + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + k01 = _mm_xor_si128( k01, k00 ); + + x = _mm_xor_si128( x, k01 ); + x = _mm_aesenc_si128( x, mm_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + k02 = _mm_xor_si128( k02, k01 ); + + x = _mm_xor_si128( x, k02 ); + x = _mm_aesenc_si128( x, mm_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + k03 = _mm_xor_si128( k03, k02 ); + + x = _mm_xor_si128( x, k03 ); + x = _mm_aesenc_si128( x, mm_zero ); + p1 = _mm_xor_si128( p1, x ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = _mm_xor_si128( k10, k03 ); + + x = _mm_xor_si128( p0, k10 ); + x = _mm_aesenc_si128( x, mm_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + k11 = _mm_xor_si128( k11, k10 ); + + x = _mm_xor_si128( x, k11 ); + x = _mm_aesenc_si128( x, mm_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + k12 = _mm_xor_si128( k12, k11 ); + + x = _mm_xor_si128( x, k12 ); + x = _mm_aesenc_si128( x, mm_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + k13 = _mm_xor_si128( k13, k12 ); + + x = _mm_xor_si128( x, k13 ); + x = _mm_aesenc_si128( x, mm_zero ); + p3 = _mm_xor_si128( p3, x ); + + // round 4, 8, 12 + + k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) ); + + x = _mm_xor_si128( p1, k00 ); + x = _mm_aesenc_si128( x, mm_zero ); + k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) ); + + x = _mm_xor_si128( x, k01 ); + x = _mm_aesenc_si128( x, mm_zero ); + k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) ); + + x = _mm_xor_si128( x, k02 ); + x = _mm_aesenc_si128( x, mm_zero ); + k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) ); + + x = _mm_xor_si128( x, k03 ); + x = _mm_aesenc_si128( x, mm_zero ); + p0 = _mm_xor_si128( p0, x ); + k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) ); + + x = _mm_xor_si128( p3, k10 ); + x = _mm_aesenc_si128( x, mm_zero ); + k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) ); + + x = _mm_xor_si128( x, k11 ); + x = _mm_aesenc_si128( x, mm_zero ); + k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) ); + + x = _mm_xor_si128( x, k12 ); + x = _mm_aesenc_si128( x, mm_zero ); + k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) ); + + x = _mm_xor_si128( x, k13 ); + x = _mm_aesenc_si128( x, mm_zero ); + p2 = _mm_xor_si128( p2, x ); + } + + // round 13 + + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = _mm_xor_si128( k00, k13 ); + + x = _mm_xor_si128( p0, k00 ); + x = _mm_aesenc_si128( x, mm_zero ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + k01 = _mm_xor_si128( k01, k00 ); + + x = _mm_xor_si128( x, k01 ); + x = _mm_aesenc_si128( x, mm_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + k02 = _mm_xor_si128( k02, k01 ); + + x = _mm_xor_si128( x, k02 ); + x = 
_mm_aesenc_si128( x, mm_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + k03 = _mm_xor_si128( k03, k02 ); + + x = _mm_xor_si128( x, k03 ); + x = _mm_aesenc_si128( x, mm_zero ); + p3 = _mm_xor_si128( p3, x ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = _mm_xor_si128( k10, k03 ); + + x = _mm_xor_si128( p2, k10 ); + x = _mm_aesenc_si128( x, mm_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + k11 = _mm_xor_si128( k11, k10 ); + + x = _mm_xor_si128( x, k11 ); + x = _mm_aesenc_si128( x, mm_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( + ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); + + x = _mm_xor_si128( x, k12 ); + x = _mm_aesenc_si128( x, mm_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + k13 = _mm_xor_si128( k13, k12 ); + + x = _mm_xor_si128( x, k13 ); + x = _mm_aesenc_si128( x, mm_zero ); + p1 = _mm_xor_si128( p1, x ); + + h[0] = _mm_xor_si128( h[0], p2 ); + h[1] = _mm_xor_si128( h[1], p3 ); + h[2] = _mm_xor_si128( h[2], p0 ); + h[3] = _mm_xor_si128( h[3], p1 ); +} + +#endif + +static void +shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv ) +{ + memcpy( sc->h, iv, sizeof sc->h ); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; + sc->count2 = 0; + sc->count3 = 0; +} + +static void +shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data, + size_t len ) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) { + sc->count1 = SPH_T32(sc->count1 + 1); + if (sc->count1 == 0) { + sc->count2 = SPH_T32(sc->count2 + 1); + if (sc->count2 == 0) { + sc->count3 = SPH_T32( + sc->count3 + 1); + } + } + } + c512(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32 ) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1, count2, count3; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += SPH_T32(ptr << 3) + n); + count1 = sc->count1; + count2 = sc->count2; + count3 = sc->count3; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 109); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } else if (ptr < 110) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 110 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 128 - ptr); + c512(sc, buf); + memset(buf, 0, 110); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } + sph_enc32le(buf + 110, count0); + sph_enc32le(buf + 114, count1); + sph_enc32le(buf + 118, count2); + sph_enc32le(buf + 122, count3); + buf[126] = (unsigned char) (out_size_w32 << 5); + buf[127] = (unsigned char) (out_size_w32 >> 3); + c512(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +void +sph_shavite512_aesni_init(void *cc) +{ + shavite_big_aesni_init(cc, IV512); +} + +void +sph_shavite512_aesni(void *cc, const void *data, size_t len) +{ + shavite_big_aesni_core(cc, data, len); +} + +void +sph_shavite512_aesni_close(void *cc, void *dst) +{ + shavite_big_aesni_close(cc, 0, 0, 
dst, 16); +} + +void +sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n, + void *dst) +{ + shavite_big_aesni_close(cc, ub, n, dst, 16); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/shavite/sph_shavite.c b/algo/shavite/sph_shavite.c index b1ebfefb..ba4384b4 100644 --- a/algo/shavite/sph_shavite.c +++ b/algo/shavite/sph_shavite.c @@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) /* see sph_shavite.h */ void -sph_shavite512_init(void *cc) +sph_shavite512_sw_init(void *cc) { shavite_big_init(cc, IV512); } /* see sph_shavite.h */ void -sph_shavite512(void *cc, const void *data, size_t len) +sph_shavite512_sw(void *cc, const void *data, size_t len) { shavite_big_core(cc, data, len); } /* see sph_shavite.h */ void -sph_shavite512_close(void *cc, void *dst) +sph_shavite512_sw_close(void *cc, void *dst) { shavite_big_close(cc, 0, 0, dst, 16); // shavite_big_init(cc, IV512); @@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst) /* see sph_shavite.h */ void -sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { shavite_big_close(cc, ub, n, dst, 16); // shavite_big_init(cc, IV512); diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index 127b54f6..ed06ca69 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -77,9 +77,9 @@ extern "C"{ */ typedef struct { #ifndef DOXYGEN_IGNORE - unsigned char buf[64]; /* first field, for alignment */ + unsigned char buf[64] __attribute__ ((aligned (64))); + sph_u32 h[8] __attribute__ ((aligned (32))); size_t ptr; - sph_u32 h[8]; sph_u32 count0, count1; #endif } sph_shavite_small_context; @@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context; */ typedef struct { #ifndef DOXYGEN_IGNORE - unsigned char buf[128]; /* first field, for alignment */ + unsigned char buf[128] __attribute__ ((aligned (64))); + sph_u32 h[16] __attribute__ ((aligned (32)));; size_t ptr; - sph_u32 h[16]; sph_u32 count0, count1, count2, count3; #endif } sph_shavite_big_context; @@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst); void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -/** - * Initialize a SHAvite-512 context. This process performs no memory allocation. - * - * @param cc the SHAvite-512 context (pointer to a - * sph_shavite512_context) - */ -void sph_shavite512_init(void *cc); +// Always define sw but only define aesni when available +// Define fptrs for aesni or sw, not both. +void sph_shavite512_sw_init(void *cc); +void sph_shavite512_sw(void *cc, const void *data, size_t len); +void sph_shavite512_sw_close(void *cc, void *dst); +void sph_shavite512_sw_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). 
- * - * @param cc the SHAvite-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -void sph_shavite512(void *cc, const void *data, size_t len); +#ifdef __AES__ +void sph_shavite512_aesni_init(void *cc); +void sph_shavite512_aesni(void *cc, const void *data, size_t len); +void sph_shavite512_aesni_close(void *cc, void *dst); +void sph_shavite512_aesni_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); -/** - * Terminate the current SHAvite-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the SHAvite-512 context - * @param dst the destination buffer - */ -void sph_shavite512_close(void *cc, void *dst); +#define sph_shavite512_init sph_shavite512_aesni_init +#define sph_shavite512 sph_shavite512_aesni +#define sph_shavite512_close sph_shavite512_aesni_close +#define sph_shavite512_addbits_and_close \ + sph_shavite512_aesni_addbits_and_close + +#else + +#define sph_shavite512_init sph_shavite512_sw_init +#define sph_shavite512 sph_shavite512_sw +#define sph_shavite512_close sph_shavite512_sw_close +#define sph_shavite512_addbits_and_close \ + sph_shavite512_sw_addbits_and_close + +#endif -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. - * - * @param cc the SHAvite-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -void sph_shavite512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - #ifdef __cplusplus } #endif diff --git a/algo/veltor.c b/algo/veltor.c index cc120ff8..3406ecb4 100644 --- a/algo/veltor.c +++ b/algo/veltor.c @@ -104,7 +104,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t bool register_veltor_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT; init_veltor_ctx(); gate->scanhash = (void*)&scanhash_veltor; gate->hash = (void*)&veltorhash; diff --git a/algo/whirlpool/md_helper.c b/algo/whirlpool/md_helper.c index 1139d559..a9f11db6 100644 --- a/algo/whirlpool/md_helper.c +++ b/algo/whirlpool/md_helper.c @@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc, current = (unsigned)sc->count_low & (SPH_BLEN - 1U); #endif -uint64_t *b= (uint64_t*)sc->buf; -uint64_t *s= (uint64_t*)sc->state; +//uint64_t *b= (uint64_t*)sc->buf; +//uint64_t *s= (uint64_t*)sc->state; // printf("Sptr 1= %u\n",current); // printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] ); // printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] ); diff --git a/algo/whirlpool/whirlpool-4way.c b/algo/whirlpool/whirlpool-4way.c index 4948302f..5d0f966d 100644 --- a/algo/whirlpool/whirlpool-4way.c +++ b/algo/whirlpool/whirlpool-4way.c @@ -1,4 +1,7 @@ #include "whirlpool-gate.h" + +#if defined(__AVX2__) + #include #include #include @@ -6,8 +9,6 @@ #include "sph_whirlpool.h" #include "whirlpool-hash-4way.h" -#if defined(__AVX2__) - static __thread whirlpool_4way_context whirl_mid; void whirlpool_hash_4way( void 
*state, const void *input ) @@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input ) } int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce, - unsigned long *hashes_done ) + uint64_t *hashes_done ) { uint32_t hash[4*8] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce, uint32_t *noncep2 = vdata + 77; uint32_t *noncep3 = vdata + 79; -// if (opt_benchmark) -// ((uint32_t*)ptarget)[7] = 0x0000ff; + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; for (int i=0; i < 19; i++) be32enc(&endiandata[i], pdata[i]); diff --git a/algo/whirlpool/whirlpool-gate.c b/algo/whirlpool/whirlpool-gate.c index d1c5d3d4..847adbc0 100644 --- a/algo/whirlpool/whirlpool-gate.c +++ b/algo/whirlpool/whirlpool-gate.c @@ -2,14 +2,15 @@ bool register_whirlpool_algo( algo_gate_t* gate ) { -//#if defined (WHIRLPOOL_4WAY) -// gate->scanhash = (void*)&scanhash_whirlpool_4way; -// gate->hash = (void*)&whirlpool_hash_4way; -//#else +#if defined (WHIRLPOOL_4WAY) + four_way_not_tested(); + gate->scanhash = (void*)&scanhash_whirlpool_4way; + gate->hash = (void*)&whirlpool_hash_4way; +#else gate->scanhash = (void*)&scanhash_whirlpool; gate->hash = (void*)&whirlpool_hash; init_whirlpool_ctx(); -//#endif +#endif return true; }; diff --git a/algo/whirlpool/whirlpool-gate.h b/algo/whirlpool/whirlpool-gate.h index 2d79377d..9fab221e 100644 --- a/algo/whirlpool/whirlpool-gate.h +++ b/algo/whirlpool/whirlpool-gate.h @@ -8,13 +8,13 @@ #define WHIRLPOOL_4WAY #endif -//#if defined (WHIRLPOOL_4WAY) +#if defined (WHIRLPOOL_4WAY) -//void whirlpool_hash_4way(void *state, const void *input); +void whirlpool_hash_4way(void *state, const void *input); -//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce, -// uint64_t *hashes_done ); -//#endif +int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); +#else void whirlpool_hash( void *state, const void *input ); @@ -22,3 +22,4 @@ int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done ); #endif +#endif diff --git a/algo/whirlpool/whirlpool-hash-4way.c b/algo/whirlpool/whirlpool-hash-4way.c index 530eed0a..9806894c 100644 --- a/algo/whirlpool/whirlpool-hash-4way.c +++ b/algo/whirlpool/whirlpool-hash-4way.c @@ -3365,6 +3365,8 @@ do { \ // scalar array of constants "table" and return referenced 64 bit entry. #define t_lane( table, inv, row, lane ) \ table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ] +// table[ t_rwo( inv, row )[ lane ] ]; + // Build a vector from elements of non-contiguous 64 bit data extracted from // scalar "table". 
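For reference, the gather pattern the t_lane comment above describes reduces to extracting each 64-bit lane as a scalar index and rebuilding a vector from the scalar table lookups. A self-contained sketch under that reading (table_gather_4x64 is a hypothetical helper name; the real macros derive the indices with t_row):

#include <stdint.h>
#include <immintrin.h>

// Gather four 64-bit table entries whose indices sit in the four lanes of
// an AVX2 vector: extract each lane as a scalar, index the table, rebuild.
static inline __m256i table_gather_4x64( const uint64_t *table, __m256i idx )
{
    return _mm256_set_epi64x( table[ _mm256_extract_epi64( idx, 3 ) ],
                              table[ _mm256_extract_epi64( idx, 2 ) ],
                              table[ _mm256_extract_epi64( idx, 1 ) ],
                              table[ _mm256_extract_epi64( idx, 0 ) ] );
}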
diff --git a/algo/x11/x11-gate.c b/algo/x11/x11-gate.c new file mode 100644 index 00000000..97a7527c --- /dev/null +++ b/algo/x11/x11-gate.c @@ -0,0 +1,18 @@ +#include "x11-gate.h" + +bool register_x11_algo( algo_gate_t* gate ) +{ +#if defined (X11_4WAY) + init_x11_4way_ctx(); + gate->scanhash = (void*)&scanhash_x11_4way; + gate->hash = (void*)&x11_hash_4way; +#else + init_x11_ctx(); + gate->scanhash = (void*)&scanhash_x11; + gate->hash = (void*)&x11_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +}; + diff --git a/algo/x11/x11-gate.h b/algo/x11/x11-gate.h new file mode 100644 index 00000000..494ef2ce --- /dev/null +++ b/algo/x11/x11-gate.h @@ -0,0 +1,30 @@ +#ifndef X11_GATE_H__ +#define X11_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +//#if defined(HASH_4WAY) && !defined(NO_AES_NI) +// #define X11_4WAY +//#endif + +bool register_x11_algo( algo_gate_t* gate ); + +#if defined(X11_4WAY) + +void x11_hash_4way( void *state, const void *input ); + +int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +#endif + +void x11_hash( void *state, const void *input ); + +int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x11_ctx(); + +#endif + diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 42f18afe..ec511ca8 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -1,5 +1,5 @@ #include "cpuminer-config.h" -#include "algo-gate-api.h" +#include "x11-gate.h" #include #include @@ -61,7 +61,7 @@ void init_x11_ctx() #endif } -static void x11_hash( void *state, const void *input ) +void x11_hash( void *state, const void *input ) { unsigned char hash[128] __attribute__ ((aligned (32))); unsigned char hashbuf[128] __attribute__ ((aligned (16))); @@ -189,7 +189,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce, pdata[19] = n; return 0; } - +/* bool register_x11_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; @@ -199,4 +199,4 @@ bool register_x11_algo( algo_gate_t* gate ) gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; - +*/ diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c index e3d9b76e..0eb36a79 100644 --- a/algo/yescrypt/yescrypt.c +++ b/algo/yescrypt/yescrypt.c @@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | AVX_OPT; gate->scanhash = (void*)&scanhash_yescrypt; gate->hash = (void*)&yescrypt_hash; gate->set_target = (void*)&scrypt_set_target; diff --git a/avxdefs.h b/avxdefs.h index 529e4aa8..e531568c 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -26,14 +26,6 @@ #define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a ) -// Blend 128 bit vectors based on vector mask. Bits are copied from arg a0 -// if corresponding mask bits are clear and from arg a1 if set. -// Should be faster than maskload. -// isn't working. 
-#define mm_merge( a0, a1, mask ) \ - _mm_or_si128( _mm_and_si128( a0, mm_not( mask ) ), \ - _mm_and_si128( a1, mask ) ) - // Memory functions // n = number of __m128i, bytes/16 @@ -59,7 +51,6 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n ) dst[i] = src[i]; } - // Pointer cast // p = any aligned pointer @@ -80,47 +71,56 @@ inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n ) #define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \ _mm_slli_epi64( w, 64-c ) ) +#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \ + _mm_srli_epi64( w, 64-c ) ) + #define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \ _mm_slli_epi32( w, 32-c ) ) -// Rotate vector elements +#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \ + _mm_srli_epi32( w, 32-c ) ) + +// Rotate elements in vector // Swap upper and lower 64 bits of 128 bit source vector -// __m128i mm128_swap64( __m128 ) #define mm_swap_64(s) _mm_shuffle_epi32( s, 0x4e ) +// Rotate 128 vector by 1 32 bit element. +#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 ) + +#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 ) + // Rotate 256 bits through two 128 bit vectors -// Swap 128 bit source vectors +// Swap 128 bit source vectors in place. // void mm128_swap128( __m128i, __m128i ) -// macro is better to update two args #define mm_swap_128(s0, s1) s0 = _mm_xor_si128(s0, s1); \ s1 = _mm_xor_si128(s0, s1); \ s0 = _mm_xor_si128(s0, s1); -// Rotate two 128 bit vectors as one 256 vector by 1 element -#define mm_rotl256_1x64x( s0, s1 ) \ +// Rotate two 128 bit vectors in place as one 256 vector by 1 element +#define mm_rotl256_1x64( s0, s1 ) \ do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = mm_merge( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull ) );\ - s1 = mm_merge( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull ) ); \ - s0 = t; \ + __m128i t; \ + s0 = mm_swap_64( s0 ); \ + s1 = mm_swap_64( s1 ); \ + t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ + s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ + s0 = t; \ } while(0) -#define mm_rotr256_1x64x( s0, s1 ) \ +#define mm_rotr256_1x64( s0, s1 ) \ do { \ - __m128i t; \ - s0 = mm_swap_64( s0 ); \ - s1 = mm_swap_64( s1 ); \ - t = mm_merge( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull ) );\ - s1 = mm_merge( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull ) ); \ - s0 = t; \ + __m128i t; \ + s0 = mm_swap_64( s0 ); \ + s1 = mm_swap_64( s1 ); \ + t = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ + s1 = _mm_blendv_epi8( s0, s1, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ + s0 = t; \ } while(0) - -#define mm_rotl256_1x64( s0, s1 ) \ +// Older slower +#define mm_rotl256_1x64x( s0, s1 ) \ do { \ __m128i t; \ s0 = mm_swap_64( s0 ); \ @@ -134,10 +134,10 @@ do { \ s0 = t; \ } while(0) -#define mm_rotr256_1x64( s0, s1 ) \ +#define mm_rotr256_1x64x( s0, s1 ) \ do { \ __m128i t; \ - s0 = mm_swap_64( s0) ; \ + s0 = mm_swap_64( s0 ) ; \ s1 = mm_swap_64( s1 ); \ t = _mm_or_si128( \ _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ @@ -148,6 +148,21 @@ do { \ s0 = t; \ } while(0) +// Rotate 256 bits through two 128 bit vectors by n*32 bits and return +// the rotated s0. +// Similar to mm_rotr256_1x32 but only a partial rotation as s1 is not +// completed. It's faster than a full rotation. 
+inline __m128i mm_rotr256_32( __m128i s0, __m128i s1, int n ) +{ + return _mm_or_si128( _mm_srli_si128( s0, n<<2 ), + _mm_slli_si128( s1, 16 - (n<<2) ) ); +} + +inline __m128i mm_rotl256_32( __m128i s0, __m128i s1, int n ) +{ + return _mm_or_si128( _mm_slli_si128( s0, n<<2 ), + _mm_srli_si128( s1, 16 - (n<<2) ) ); +} // Swap bytes in vector elements inline __m128i mm_byteswap_32( __m128i x ) @@ -161,6 +176,21 @@ inline __m128i mm_byteswap_32( __m128i x ) return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) ); } +inline __m128i mm_byteswap_64( __m128i x ) +{ + x = _mm_or_si128( _mm_srli_epi64( x, 32 ), _mm_slli_epi64( x, 32 )); + + x = _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x, + _mm_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ), + _mm_slli_epi64( _mm_and_si128( x, + _mm_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 )); + + return _mm_or_si128( _mm_srli_epi64( _mm_and_si128( x, + _mm_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ), + _mm_slli_epi64( _mm_and_si128( x, + _mm_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 )); +} + #if defined (__AVX2__) @@ -180,13 +210,6 @@ inline __m128i mm_byteswap_32( __m128i x ) #define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a ) -// Blend 256 bit vectors based on vector mask. Bits are copied from arg a0 -// if corresponding mask bits are clear and from arg a1 if set. -// Should be faster than maskload. -#define mm256_merge( a0, a1, mask ) \ - _mm256_or_si256( _mm256_and_si256( a0, mm256_not( mask ) ), \ - _mm256_and_si256( a1, mask ) - // Pack/Unpack two 128 bit vectors into/from one 256 bit vector // usefulness tbd #define mm256_pack_2x128( hi, lo ) \ @@ -198,7 +221,7 @@ inline __m128i mm_byteswap_32( __m128i x ) // Memory functions -// n = number of __m256i (32 bytes) +// n = number of 256 bit (32 byte) vectors inline void memset_zero_256( __m256i *dst, int n ) { @@ -231,7 +254,7 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) // Rotate bits in vector elements -// Rotate bits in 4 uint64 (3 instructions) +// Rotate bits in 64 bit elements // w = packed 64 bit data, n= number of bits to rotate #define mm256_rotr_64( w, c ) \ _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) ) @@ -239,19 +262,30 @@ inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) #define mm256_rotl_64( w, c ) \ _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64 - c) ) -// Rotate vector elements +// Rotate bits in 32 bit elements +#define mm256_rotr_32( w, c ) \ + _mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32 - c) ) + +#define mm256_rotl_32( w, c ) \ + _mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32 - c) ) -// Rotate 256 bits by 64 bits (4 uint64 by one uint64) +// Rotate elements in vector + +// Rotate vector by one 64 bit element (aka two 32 bit elements) //__m256i mm256_rotl256_1x64( _mm256i, int ) -#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 ) +#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 ) -#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 ) +#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 ) -// Same as 2x64 rotate in either direction +// Swap 128 bit elements (aka rotate by two 64 bit, four 32 bit elements)) #define mm256_swap_128( w ) _mm256_permute2f128_si256( w, w, 1 ) -// Swap bytes in vector elements +// Rotate by one 32 bit element (aka two 16 bit elements) +#define mm256_rotl256_1x32( w ) _mm256_shuffle_epi32( w, 0x93 ) +#define mm256_rotr256_1x32( w ) _mm256_shuffle_epi32( w, 0x39 ) 
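// A minimal standalone check, assuming AVX2 (-mavx2), that the corrected
// control byte 0x39 used by mm256_rotr256_1x64 above moves every 64 bit
// element down one position with wrap-around, i.e. a right rotation of the
// full 256 bit vector by one element.  Names here are illustrative only.

#include <immintrin.h>
#include <stdio.h>

int main()
{
   __m256i v = _mm256_set_epi64x( 4, 3, 2, 1 );        // lanes 3..0 = 4,3,2,1
   __m256i r = _mm256_permute4x64_epi64( v, 0x39 );    // mm256_rotr256_1x64( v )
   long long out[4];
   _mm256_storeu_si256( (__m256i*)out, r );
   printf( "%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3] );
   // prints 2 3 4 1: old lane 0 wrapped to lane 3, all others moved down one
   return 0;
}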
+ +// Swap bytes in vector elements inline __m256i mm256_byteswap_32( __m256i x ) { __m256i x1 = _mm256_and_si256( x, _mm256_set1_epi32( 0x0000ff00 ) ); @@ -269,14 +303,14 @@ inline __m256i mm256_byteswap_64( __m256i x ) x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 )); x = _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ), + _mm256_set1_epi64x( 0xFFFF0000FFFF0000 ) ), 16 ), _mm256_slli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 )); + _mm256_set1_epi64x( 0x0000FFFF0000FFFF ) ), 16 )); - return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ), - _mm256_slli_epi64( _mm256_and_si256( x, - _mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 )); + return _mm256_or_si256( _mm256_srli_epi64( _mm256_and_si256( x, + _mm256_set1_epi64x( 0xFF00FF00FF00FF00 ) ), 8 ), + _mm256_slli_epi64( _mm256_and_si256( x, + _mm256_set1_epi64x( 0x00FF00FF00FF00FF ) ), 8 )); } // Pseudo parallel aes @@ -287,7 +321,6 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k ) mm256_unpack_2x128( hi, lo, x ); mm256_unpack_2x128( khi, klo, k ); - lo = _mm_aesenc_si128( lo, klo ); hi = _mm_aesenc_si128( hi, khi ); @@ -299,7 +332,6 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) __m128i hi, lo; mm256_unpack_2x128( hi, lo, x ); - lo = _mm_aesenc_si128( lo, mm_zero ); hi = _mm_aesenc_si128( hi, mm_zero ); @@ -308,32 +340,37 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) #endif // AVX2 -// AVX - // Paired functions for interleaving and deinterleaving data for vector // processing. // Size is specfied in bits regardless of vector size to avoid pointer // arithmetic confusion with different size vectors and be consistent with // the function's name. - -// Only 256, 512 and 640 bit length, (32, 64 & 80 bytes respectively) -// are supported. -// Buffer length is specified in bits to match the function naming format. +// +// Each function has 2 implementations, an optimized version that uses +// vector indexing and a slower version that uses pointers. The optimized +// version can only be used with 64 bit elements and only supports sizes +// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively. +// +// NOTE: Contrary to GCC documentation accessing vector elements using array +// indexes only works with 64 bit elements. +// Interleaving and deinterleaving of vectors of 32 bit elements +// must use the slower implementations that don't use vector indexing. +// // All data must be aligned to 256 bits for AVX2, or 128 bits for AVX. // Interleave source args and deinterleave destination args are not required -// to be contiguous but it's more efficient if they are. +// to be contiguous in memory but it's more efficient if they are. // Interleave source agrs may be the same actual arg repeated. -// 640 bit deinterleaving 4x64 or 8x32 using 256 bit AVX2 requires the +// 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the // destination buffers be defined with padding up to 768 bits for overrun -// space. -// Overrrun space is not needed when interleaving or when deinterleaving -// 4x32 using 128 bit AVX. -// Overrun space use is non destructive and should be ignored by the -// caller. +// space. Although overrun space use is non destructive it should not overlay +// useful data and should be ignored by the caller. 
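// A portable, standalone illustration of the 4x32 interleaved layout the
// comment above describes: word j of stream i is stored at dst[ 4*j + i ],
// so one 128 bit load from dst + 4*j picks up word j of all four streams,
// one per lane.  Array sizes and names here are illustrative only.

#include <stdint.h>
#include <stdio.h>

int main()
{
   uint32_t s0[4] = { 0x00, 0x01, 0x02, 0x03 };
   uint32_t s1[4] = { 0x10, 0x11, 0x12, 0x13 };
   uint32_t s2[4] = { 0x20, 0x21, 0x22, 0x23 };
   uint32_t s3[4] = { 0x30, 0x31, 0x32, 0x33 };
   uint32_t *src[4] = { s0, s1, s2, s3 };
   uint32_t dst[16];

   for ( int j = 0; j < 4; j++ )        // one 128 bit row per source word
      for ( int i = 0; i < 4; i++ )     // one 32 bit word from each stream
         dst[ 4*j + i ] = src[i][j];

   for ( int k = 0; k < 16; k++ )
      printf( "%02x%c", dst[k], ( k & 3 ) == 3 ? '\n' : ' ' );
   return 0;
}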
+ +// SSE2 AVX -// interleave 4 arrays of 32 bit elements for AVX processing +// interleave 4 arrays of 32 bit elements for 128 bit processing // bit_len must be 256, 512 or 640 bits. -inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, +// Vector indexing doesn't work with 32 bit data. +inline void mm_interleave_4x32x( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, int bit_len ) { uint32_t *s0 = (uint32_t*)src0; @@ -346,7 +383,6 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, d[1] = _mm_set_epi32( s3[ 1], s2[ 1], s1[ 1], s0[ 1] ); d[2] = _mm_set_epi32( s3[ 2], s2[ 2], s1[ 2], s0[ 2] ); d[3] = _mm_set_epi32( s3[ 3], s2[ 3], s1[ 3], s0[ 3] ); - d[4] = _mm_set_epi32( s3[ 4], s2[ 4], s1[ 4], s0[ 4] ); d[5] = _mm_set_epi32( s3[ 5], s2[ 5], s1[ 5], s0[ 5] ); d[6] = _mm_set_epi32( s3[ 6], s2[ 6], s1[ 6], s0[ 6] ); @@ -371,22 +407,27 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] ); } -// interleave 4 arrays of 32 bit elements for AVX processing // bit_len must be multiple of 32 -inline void mm_interleave_4x32x( uint32_t *dst, uint32_t *src0, - uint32_t *src1, uint32_t *src2, uint32_t *src3, int bit_len ) +inline void mm_interleave_4x32( void *dst, void *src0, void *src1, + void *src2, void *src3, int bit_len ) { - uint32_t *d = dst;; + uint32_t *d = (uint32_t*)dst; + uint32_t *s0 = (uint32_t*)src0; + uint32_t *s1 = (uint32_t*)src1; + uint32_t *s2 = (uint32_t*)src2; + uint32_t *s3 = (uint32_t*)src3; + for ( int i = 0; i < bit_len >> 5; i++, d += 4 ) { - *d = *(src0+i); - *(d+1) = *(src1+i); - *(d+2) = *(src2+i); - *(d+3) = *(src3+i); + *d = *(s0+i); + *(d+1) = *(s1+i); + *(d+2) = *(s2+i); + *(d+3) = *(s3+i); } } -inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, +// doesn't work with 32 bit elements +inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) { uint32_t *s = (uint32_t*)src; @@ -428,17 +469,21 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, // deinterleave 4 arrays into individual buffers for scalarm processing // bit_len must be multiple of 32 -inline void mm_deinterleave_4x32x( uint32_t *dst0, uint32_t *dst1, - uint32_t *dst2,uint32_t *dst3, uint32_t *src, - int bit_len ) +inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, int bit_len ) { - uint32_t *s = src; + uint32_t *s = (uint32_t*)src; + uint32_t *d0 = (uint32_t*)dst0; + uint32_t *d1 = (uint32_t*)dst1; + uint32_t *d2 = (uint32_t*)dst2; + uint32_t *d3 = (uint32_t*)dst3; + for ( int i = 0; i < bit_len >> 5; i++, s += 4 ) { - *(dst0+i) = *s; - *(dst1+i) = *(s+1); - *(dst2+i) = *(s+2); - *(dst3+i) = *(s+3); + *(d0+i) = *s; + *(d1+i) = *(s+1); + *(d2+i) = *(s+2); + *(d3+i) = *(s+3); } } @@ -473,23 +518,27 @@ inline void mm256_interleave_4x64( void *dst, const void *src0, d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] ); } - -// interleave 4 arrays of 64 bit elements for AVX2 processing +// Slower version // bit_len must be multiple of 64 -inline void mm256_interleave_4x64x( uint64_t *dst, uint64_t *src0, - uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len ) +inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, + void *src2, void *src3, int bit_len ) { - uint64_t *d = dst; + uint64_t *d = (uint64_t*)dst; + uint64_t *s0 = (uint64_t*)src0; + uint64_t *s1 = 
(uint64_t*)src1; + uint64_t *s2 = (uint64_t*)src2; + uint64_t *s3 = (uint64_t*)src3; + for ( int i = 0; i < bit_len>>6; i++, d += 4 ) { - *d = *(src0+i); - *(d+1) = *(src1+i); - *(d+2) = *(src2+i); - *(d+3) = *(src3+i); + *d = *(s0+i); + *(d+1) = *(s1+i); + *(d+2) = *(s2+i); + *(d+3) = *(s3+i); } } -// Deinterleave 4 buffers of 32 bit data from the source buffer. +// Deinterleave 4 buffers of 64 bit data from the source buffer. inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) { @@ -520,25 +569,30 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] ); } - -// Deinterleave 4 arrays into indivudual 64 bit arrays for scalar processing +// Slower version // bit_len must be multiple 0f 64 -inline void mm256_deinterleave_4x64x( uint64_t *dst0, uint64_t *dst1, - uint64_t *dst2,uint64_t *dst3, uint64_t *src, int bit_len ) +inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2, + void *dst3, void *src, int bit_len ) { - uint64_t *s = src; - for ( int i = 0; i < bit_len>>6; i++, s += 4 ) + uint64_t *s = (uint64_t*)src; + uint64_t *d0 = (uint64_t*)dst0; + uint64_t *d1 = (uint64_t*)dst1; + uint64_t *d2 = (uint64_t*)dst2; + uint64_t *d3 = (uint64_t*)dst3; + + for ( int i = 0; i < bit_len>>6; i++, s += 4 ) { - *(dst0+i) = *s; - *(dst1+i) = *(s+1); - *(dst2+i) = *(s+2); - *(dst3+i) = *(s+3); + *(d0+i) = *s; + *(d1+i) = *(s+1); + *(d2+i) = *(s+2); + *(d3+i) = *(s+3); } } // Interleave 8 source buffers containing 32 bit data into the destination -// buffer -inline void mm256_interleave_8x32( void *dst, const void *src0, +// vector +// Doesn't work, vecror indexing doesn't work for 32 bit elements +inline void mm256_interleave_8x32x( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, const void *src4, const void *src5, const void *src6, const void *src7, int bit_len ) { @@ -600,10 +654,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0, s3[19], s2[19], s1[19], s0[19] ); } - -// interleave 8 arrays of 32 bit elements for AVX2 processing +// Slower but it works with 32 bit data // bit_len must be multiple of 32 -inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, +inline void mm256_interleave_8x32( uint32_t *dst, uint32_t *src0, uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4, uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len ) { @@ -622,7 +675,7 @@ inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, } // Deinterleave 8 buffers of 32 bit data from the source buffer. 
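// A minimal standalone illustration, assuming GCC or Clang with -mavx2, of
// the vector element indexing the fast 64 bit interleave variants above rely
// on: __m256i is defined as a vector of four 64 bit integers, so v[i] reads
// one whole 64 bit lane.  This is consistent with the note above that only
// 64 bit element indexing works, which is why the 32 bit variants keep the
// slower pointer loops.  Names here are illustrative only.

#include <immintrin.h>
#include <stdio.h>

int main()
{
   __m256i v = _mm256_set_epi64x( 40, 30, 20, 10 );   // lanes 3..0
   printf( "%lld %lld %lld %lld\n",
           (long long)v[0], (long long)v[1],
           (long long)v[2], (long long)v[3] );        // prints 10 20 30 40
   return 0;
}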
-inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, +inline void mm256_deinterleave_8x32x( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src, int bit_len ) { @@ -703,10 +756,9 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, s[159], s[151], s[143], s[135] ); } - // Deinterleave 8 arrays into indivdual buffers for scalar processing // bit_len must be multiple of 32 -inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, +inline void mm256_deinterleave_8x32( uint32_t *dst0, uint32_t *dst1, uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5, uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len ) { @@ -763,4 +815,4 @@ inline void mm_reinterleave_4x32( uint32_t *dst, uint64_t *src, } #endif // __AVX2__ -#endif // AVX_DEF_H__ +#endif // AVXDEFS_H__ diff --git a/configure b/configure index a9d7f1ce..eb3f86a3 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.5. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.6. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.7.5' -PACKAGE_STRING='cpuminer-opt 3.7.5' +PACKAGE_VERSION='3.7.6' +PACKAGE_STRING='cpuminer-opt 3.7.6' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.7.5 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.7.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1392,7 +1392,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.7.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.7.6:";; esac cat <<\_ACEOF @@ -1497,7 +1497,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.7.5 +cpuminer-opt configure 3.7.6 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.7.5, which was +It was created by cpuminer-opt $as_me 3.7.6, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2981,7 +2981,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.7.5' + VERSION='3.7.6' cat >>confdefs.h <<_ACEOF @@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.7.5, which was +This file was extended by cpuminer-opt $as_me 3.7.6, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6743,7 +6743,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.7.5 +cpuminer-opt config.status 3.7.6 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 7b64b757..aaadbee4 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.7.5]) +AC_INIT([cpuminer-opt], [3.7.6]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 4383f205..ab10b29d 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -103,7 +103,7 @@ enum algos opt_algo = ALGO_NULL; int opt_scrypt_n = 0; int opt_pluck_n = 128; int opt_n_threads = 0; -int64_t opt_affinity = -1L; +int64_t opt_affinity = -1; int opt_priority = 0; int num_cpus; char *rpc_url = NULL;; @@ -195,20 +195,27 @@ static inline void drop_policy(void) #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ #endif -static void affine_to_cpu_mask(int id, unsigned long mask) { - cpu_set_t set; - CPU_ZERO(&set); - for (uint8_t i = 0; i < num_cpus; i++) { - // cpu mask - if (mask & (1UL< 256) ? 256 : num_cpus; + + for ( uint8_t i = 0; i < ncpus; i++ ) + { + // cpu mask + if( (ncpus > 64) || ( mask & (1UL << i) ) ) CPU_SET( i, &set ); + } + if ( id == -1 ) + { + // process affinity + sched_setaffinity(0, sizeof(&set), &set); + } + else + { + // thread only + pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); + } } #elif defined(WIN32) /* Windows */ @@ -1669,21 +1676,30 @@ static void *miner_thread( void *userdata ) drop_policy(); } // CPU thread affinity - if (num_cpus > 1) + if ( num_cpus > 64 ) { - if (opt_affinity == -1 && opt_n_threads > 1) + // opt_affinity ignored with more than 64 cpus. + if (opt_debug) + applog( LOG_DEBUG, "Binding thread %d to cpu %d", + thr_id, thr_id % num_cpus ); + affine_to_cpu_mask( thr_id, -1 ); + } + else if ( num_cpus > 1 ) + { + if ( (opt_affinity == -1) && (opt_n_threads) > 1 ) { if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", + applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, thr_id % num_cpus, ( 1 << (thr_id % num_cpus) ) ); - affine_to_cpu_mask(thr_id, 1UL << (thr_id % num_cpus)); + + affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) ); } - else if (opt_affinity != -1L) + else if (opt_affinity != -1) { - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu mask %x", + if (opt_debug) + applog( LOG_DEBUG, "Binding thread %d to cpu mask %x", thr_id, opt_affinity); - affine_to_cpu_mask(thr_id, (unsigned long)opt_affinity); + affine_to_cpu_mask( thr_id, opt_affinity ); } } @@ -1822,10 +1838,10 @@ static void *miner_thread( void *userdata ) num_submitted++; } #if FOUR_WAY -if (num_submitted>1) - applog(LOG_NOTICE, "4 WAY hash, %u nonces submitted," CL_MAG " BONUS!" CL_WHT, num_submitted); +if (num_submitted > 1) + applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u" CL_MAG " BONUS!" 
CL_N, num_submitted); else - applog(LOG_NOTICE, "4 WAY hash %u nonce submitted", num_submitted); + applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u", num_submitted); #endif // must be a one way algo, nonce is already in work data if ( !num_submitted ) @@ -1836,7 +1852,7 @@ else break; } #if FOUR_WAY -applog(LOG_NOTICE, "1 WAY hash 1 nonce submitted"); +applog(LOG_NOTICE, "1 WAY hash nonce submitted"); #endif } @@ -2948,12 +2964,16 @@ bool check_cpu_capability () printf(".\nAlgo features:"); - if ( algo_has_sse2 ) printf( " SSE2" ); - if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_avx ) printf( " AVX" ); - if ( algo_has_avx2 ) printf( " AVX2" ); - if ( algo_has_4way ) printf( " 4WAY" ); - if ( algo_has_sha ) printf( " SHA" ); + if ( algo_features == EMPTY_SET ) printf( " None" ); + else + { + if ( algo_has_sse2 ) printf( " SSE2" ); + if ( algo_has_aes ) printf( " AES" ); + if ( algo_has_avx ) printf( " AVX" ); + if ( algo_has_avx2 ) printf( " AVX2" ); + if ( algo_has_4way ) printf( " 4WAY" ); + if ( algo_has_sha ) printf( " SHA" ); + } printf(".\n"); // Check for CPU and build incompatibilities @@ -3166,13 +3186,22 @@ int main(int argc, char *argv[]) SetPriorityClass(GetCurrentProcess(), prio); } #endif - if (opt_affinity != -1) - { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); - affine_to_cpu_mask(-1, (unsigned long)opt_affinity); - } + if ( opt_affinity != -1 ) + { + if ( num_cpus > 64 ) + { + applog(LOG_WARNING,"--cpu-affinity argument is not supported with more"); + applog(LOG_WARNING," than 64 CPUs, using default affinity."); + opt_affinity = -1; + } + else + { + if (!opt_quiet) + applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); + affine_to_cpu_mask( -1, (unsigned long)opt_affinity ); + } + } //#ifdef HAVE_SYSLOG_H // if (use_syslog) diff --git a/miner.h b/miner.h index 53ec8350..ddf9e111 100644 --- a/miner.h +++ b/miner.h @@ -506,6 +506,7 @@ enum algos { ALGO_KECCAKC, ALGO_LBRY, ALGO_LUFFA, + ALGO_LYRA2H, ALGO_LYRA2RE, ALGO_LYRA2REV2, ALGO_LYRA2Z, @@ -576,6 +577,7 @@ static const char* const algo_names[] = { "keccakc", "lbry", "luffa", + "lyra2h", "lyra2re", "lyra2rev2", "lyra2z", @@ -700,6 +702,7 @@ Options:\n\ keccakc Creative Coin\n\ lbry LBC, LBRY Credits\n\ luffa Luffa\n\ + lyra2h Hppcoin\n\ lyra2re lyra2\n\ lyra2rev2 lyrav2, Vertcoin\n\ lyra2z Zcoin (XZC)\n\