diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index e22c2baa..a8a5e1a8 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,19 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.19.2
+
+Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
+
+Reduced log noise when replies to submitted shares are lost due to stratum errors.
+
+Fugue prehash optimization for X16r family AVX2 & AVX512.
+
+Small speed improvement for Hamsi AVX2 & AVX512.
+
+Win: With CPU groups enabled, the ASCII art affinity map now displays the
+number of CPUs in a CPU group rather than the total number of CPUs (up to 64).
+
 v3.19.1
 
 Changes to Windows binaries package:
diff --git a/algo/fugue/fugue-aesni.h b/algo/fugue/fugue-aesni.h
index d1536641..13fd8f87 100644
--- a/algo/fugue/fugue-aesni.h
+++ b/algo/fugue/fugue-aesni.h
@@ -37,12 +37,23 @@ typedef struct
 } hashState_fugue __attribute__ ((aligned (64)));
 
+
+// These functions are deprecated, use the lower case macro aliases that use
+// the standard interface. This will be cleaned up at a later date.
 HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
 
 HashReturn fugue512_Update(hashState_fugue *state, const void *data,
                            DataLength databitlen);
 
 HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
 
+#define fugue512_init( state ) \
+   fugue512_Init( state, 512 )
+#define fugue512_update( state, data, len ) \
+   fugue512_Update( state, data, (len)<<3 )
+#define fugue512_final \
+   fugue512_Final
+
+
 HashReturn fugue512_full(hashState_fugue *hs, void *hashval,
                          const void *data, DataLength databitlen);
 
 #endif // AES
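The lower case aliases above take the data length in bytes and convert it to bits
for the legacy interface, which is what the X16r prehash code further down expects.
A minimal usage sketch of the midstate pattern they enable, assuming this header's
declarations; the helper names and the edata/midstate variables are illustrative
and not taken from the source:

#include <stdint.h>
#include <string.h>
#include "fugue-aesni.h"   /* hashState_fugue plus the fugue512_* macros above */

/* Build the nonce-independent midstate once per work unit: the first 76 of the
   80 block header bytes do not contain the nonce. */
static void fugue512_prehash76( hashState_fugue *midstate, const uint8_t *edata )
{
   fugue512_init( midstate );               /* expands to fugue512_Init( midstate, 512 ) */
   fugue512_update( midstate, edata, 76 );  /* length in bytes, converted to bits by the macro */
}

/* Per nonce: clone the midstate, absorb the last 4 bytes, finalize. */
static void fugue512_hash80( uint8_t *hash, const hashState_fugue *midstate,
                             const uint8_t *edata )
{
   hashState_fugue ctx;
   memcpy( &ctx, midstate, sizeof ctx );
   fugue512_update( &ctx, edata + 76, 4 );
   fugue512_final( &ctx, hash );
}
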
diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c
index 26e133c9..b7b7c705 100644
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -550,16 +550,38 @@ static const sph_u32 T512[64][16] = {
 
 // Hamsi 8 way AVX512
 
+// Tested on i9-9940x: movepi64_mask is slow, cmplt_epi64_mask with zero
+// produces the same result but is faster.
+#define INPUT_BIG8 \
+do { \
+  __m512i db = _mm512_ror_epi64( *buf, 1 ); \
+  const uint64_t *tp = (const uint64_t*)T512; \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
+  for ( int u = 0; u < 64; u++ ) \
+  { \
+    __mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \
+    m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
+    m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
+    m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
+    m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
+    m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
+    m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
+    m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
+    m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
+    db = _mm512_ror_epi64( db, 1 ); \
+    tp += 8; \
+  } \
+} while (0)
+
+/*
 #define INPUT_BIG8 \
 do { \
   __m512i db = *buf; \
-  const uint64_t *tp = (uint64_t*)&T512[0][0]; \
+  const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
   for ( int u = 0; u < 64; u++ ) \
   { \
-    __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
-    dm = mm512_negate_32( _mm512_or_si512( dm, \
-                          _mm512_slli_epi64( dm, 32 ) ) ); \
+    __m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \
     m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
     m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
     m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
@@ -572,6 +594,7 @@ do { \
     db = _mm512_srli_epi64( db, 1 ); \
   } \
 } while (0)
+*/
 
 #define SBOX8( a, b, c, d ) \
 do { \
@@ -888,13 +911,11 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 #define INPUT_BIG \
 do { \
   __m256i db = *buf; \
-  const uint64_t *tp = (uint64_t*)&T512[0][0]; \
+  const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
   for ( int u = 0; u < 64; u++ ) \
   { \
-    __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
-    dm = mm256_negate_32( _mm256_or_si256( dm, \
-                          _mm256_slli_epi64( dm, 32 ) ) ); \
+    __m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \
     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
                                m256_const1_64( tp[0] ) ) ); \
     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c
index 95639691..5557ca33 100644
--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -1544,7 +1544,6 @@ bool register_scrypt_algo( algo_gate_t* gate )
    format_number_si( &t_size, t_units );
    format_number_si( &d_size, d_units );
-
    applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
            SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );
diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c
index 8d4fb058..39efd257 100644
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -60,7 +60,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
      case HAMSI:
         mm512_bswap32_intrlv80_8x64( vdata, pdata );
         hamsi512_8way_init( &x16r_ctx.hamsi );
-        hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 );
+        hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 );
+     break;
+     case FUGUE:
+        mm128_bswap32_80( edata, pdata );
+        fugue512_init( &x16r_ctx.fugue );
+        fugue512_update( &x16r_ctx.fugue, edata, 76 );
+        intrlv_8x64( vdata, edata, edata, edata, edata,
+                     edata, edata, edata, edata, 640 );
      break;
      case SHABAL:
        mm256_bswap32_intrlv80_8x32( vdata2, pdata );
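The prehash lengths above follow from where the nonce sits in the 80-byte block
header and from each algorithm's input granularity: Hamsi-512 absorbs 8-byte
blocks, so the first 72 bytes are nonce-independent, while Fugue-512 absorbs
4 bytes at a time, so 76 bytes can be hashed once per work unit. A minimal
sketch of that arithmetic; the constant and function names are illustrative,
not from the source:

#include <assert.h>
#include <stddef.h>

enum {
   HEADER_LEN   = 80,  /* x16r block header size in bytes     */
   NONCE_OFFSET = 76,  /* the nonce occupies the last 4 bytes */
   HAMSI_BLOCK  =  8,  /* Hamsi-512 input block size in bytes */
   FUGUE_WORD   =  4   /* Fugue-512 absorbs 32 bits per round */
};

/* Longest prefix that can be hashed once and reused for every nonce. */
static size_t prehash_len( size_t granularity )
{
   return ( NONCE_OFFSET / granularity ) * granularity;
}

int main( void )
{
   assert( prehash_len( HAMSI_BLOCK ) == 72 );  /* remaining 8 bytes hashed per nonce */
   assert( prehash_len( FUGUE_WORD )  == 76 );  /* remaining 4 bytes hashed per nonce */
   assert( HEADER_LEN - NONCE_OFFSET  ==  4 );  /* only the nonce changes per attempt */
   return 0;
}
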
@@ -306,7 +313,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
      break;
     case HAMSI:
        if ( i == 0 )
-          hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
+          hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
        else
        {
           intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -319,14 +326,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
                      hash7, vhash );
     break;
     case FUGUE:
-       fugue512_full( &ctx.fugue, hash0, in0, size );
-       fugue512_full( &ctx.fugue, hash1, in1, size );
-       fugue512_full( &ctx.fugue, hash2, in2, size );
-       fugue512_full( &ctx.fugue, hash3, in3, size );
-       fugue512_full( &ctx.fugue, hash4, in4, size );
-       fugue512_full( &ctx.fugue, hash5, in5, size );
-       fugue512_full( &ctx.fugue, hash6, in6, size );
-       fugue512_full( &ctx.fugue, hash7, in7, size );
+       if ( i == 0 )
+       {
+          fugue512_update( &ctx.fugue, in0 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash0 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in1 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash1 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in2 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash2 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in3 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash3 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in4 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash4 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in5 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash5 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in6 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash6 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in7 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash7 );
+       }
+       else
+       {
+          fugue512_full( &ctx.fugue, hash0, in0, size );
+          fugue512_full( &ctx.fugue, hash1, in1, size );
+          fugue512_full( &ctx.fugue, hash2, in2, size );
+          fugue512_full( &ctx.fugue, hash3, in3, size );
+          fugue512_full( &ctx.fugue, hash4, in4, size );
+          fugue512_full( &ctx.fugue, hash5, in5, size );
+          fugue512_full( &ctx.fugue, hash6, in6, size );
+          fugue512_full( &ctx.fugue, hash7, in7, size );
+       }
     break;
     case SHABAL:
        intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -347,25 +383,25 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
        {
           sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash0 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash1 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash2 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash3 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash4 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash5 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash6 );
-          memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+          memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
           sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 );
           sph_whirlpool_close( &ctx.whirlpool, hash7 );
        }
@@ -532,7 +568,13 @@ void x16r_4way_prehash( void *vdata, void *pdata )
      case HAMSI:
        mm256_bswap32_intrlv80_4x64( vdata, pdata );
        hamsi512_4way_init( &x16r_ctx.hamsi );
-       hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 );
+       hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 );
+     break;
+     case FUGUE:
+       mm128_bswap32_80( edata, pdata );
+       fugue512_init( &x16r_ctx.fugue );
+       fugue512_update( &x16r_ctx.fugue, edata, 76 );
+       intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
      break;
      case SHABAL:
        mm128_bswap32_intrlv80_4x32( vdata2, pdata );
@@ -734,7 +776,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
     break;
     case HAMSI:
        if ( i == 0 )
-          hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
+          hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
        else
        {
           intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -745,10 +787,27 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
        dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
     break;
     case FUGUE:
-       fugue512_full( &ctx.fugue, hash0, in0, size );
-       fugue512_full( &ctx.fugue, hash1, in1, size );
-       fugue512_full( &ctx.fugue, hash2, in2, size );
-       fugue512_full( &ctx.fugue, hash3, in3, size );
+       if ( i == 0 )
+       {
+          fugue512_update( &ctx.fugue, in0 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash0 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in1 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash1 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in2 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash2 );
+          memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
+          fugue512_update( &ctx.fugue, in3 + 76, 4 );
+          fugue512_final( &ctx.fugue, hash3 );
+       }
+       else
+       {
+          fugue512_full( &ctx.fugue, hash0, in0, size );
+          fugue512_full( &ctx.fugue, hash1, in1, size );
+          fugue512_full( &ctx.fugue, hash2, in2, size );
+          fugue512_full( &ctx.fugue, hash3, in3, size );
+       }
     break;
     case SHABAL:
        intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
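For the Hamsi speedup noted above (algo/hamsi/hamsi-hash-4way.c), the rewritten
INPUT_BIG8 replaces the negate-and-AND select with an AVX-512 mask register: the
selector bit is rotated into the sign position, a signed compare against zero
turns it into a k-mask, and the table constant is XORed only into the selected
lanes. A minimal standalone sketch of that pattern, with an illustrative
function name; it assumes an AVX-512F capable compiler:

#include <immintrin.h>
#include <stdint.h>

/* For each 64-bit lane of 'bits' whose least significant bit is set, XOR
   'value' into the corresponding lane of 'acc'; other lanes pass through. */
static inline __m512i xor_if_lsb_set( __m512i acc, __m512i bits, uint64_t value )
{
   /* Rotate the LSB into the sign bit so a signed "< 0" compare selects it. */
   __m512i  rotated = _mm512_ror_epi64( bits, 1 );
   __mmask8 lanes   = _mm512_cmplt_epi64_mask( rotated, _mm512_setzero_si512() );
   /* Masked XOR: lanes outside the mask are copied from the first operand. */
   return _mm512_mask_xor_epi64( acc, lanes, acc,
                                 _mm512_set1_epi64( (long long)value ) );
}
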
diff --git a/configure b/configure
index eca6ff1f..f678bda4 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.19.2.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.19.1'
-PACKAGE_STRING='cpuminer-opt 3.19.1'
+PACKAGE_VERSION='3.19.2'
+PACKAGE_STRING='cpuminer-opt 3.19.2'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.19.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.19.2 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.19.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.19.2:";;
   esac
   cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.19.1
+cpuminer-opt configure 3.19.2
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.19.1, which was
+It was created by cpuminer-opt $as_me 3.19.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.19.1'
+ VERSION='3.19.2'
 
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.19.1, which was
+This file was extended by cpuminer-opt $as_me 3.19.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.19.1
+cpuminer-opt config.status 3.19.2
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index 11d4e595..314b0d5b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.19.1])
+AC_INIT([cpuminer-opt], [3.19.2])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index ee31ae58..7a70f9f1 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -224,7 +224,11 @@ static uint8_t thread_affinity_map[ max_cpus ];
 // display affinity mask graphically
 static void format_affinity_mask( char *mask_str, uint64_t mask )
 {
+#if defined(WINDOWS_CPU_GROUPS_ENABLED)
+   int n = num_cpus / num_cpugroups;
+#else
    int n = num_cpus < 64 ? num_cpus : 64;
+#endif
    int i;
    for ( i = 0; i < n; i++ )
    {
@@ -2164,7 +2168,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
       }  // !quiet
    }  // new diff/block
 
-   if ( new_job && !opt_quiet )
+   if ( new_job && !( opt_quiet || stratum_errors ) )
    {
       int mismatch = submitted_share_count
                    - ( accepted_share_count + stale_share_count
@@ -3609,7 +3613,9 @@ int main(int argc, char *argv[])
    num_cpus = 1;
 #endif
 
-   if ( num_cpus < 1 ) num_cpus = 1;
+   if ( num_cpus < 1 )
+      num_cpus = 1;
+
+   opt_n_threads = num_cpus;
 
    parse_cmdline( argc, argv );
 
@@ -3745,9 +3751,6 @@ int main(int argc, char *argv[])
    }
 #endif
 
-   if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) )
-      opt_n_threads = num_cpus;
-
    if ( opt_affinity && num_cpus > max_cpus )
    {
       applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",