v3.18.2

JayDDee committed Oct 20, 2021
1 parent 47cc5dc commit 1a234cb
Showing 18 changed files with 475 additions and 190 deletions.
1 change: 1 addition & 0 deletions Makefile.am
@@ -171,6 +171,7 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
14 changes: 14 additions & 0 deletions RELEASE_NOTES
@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
Change Log
----------

v3.18.2

Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.

AVX512 for sha256d.
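
For reference, sha256d is simply SHA-256 applied twice: hash the input,
then hash the resulting 32 byte digest. A minimal scalar sketch of the
idea (sha256_full is a hypothetical one-shot helper, not necessarily the
real API; the AVX512 code in algo/sha/sha256d-4way.c computes many such
hashes in parallel, one candidate per 32-bit lane of a wide register):

#include <stdint.h>
#include <stddef.h>

// Assumed one-shot SHA-256 primitive provided elsewhere.
void sha256_full( uint8_t *out, const uint8_t *in, size_t len );

// Double SHA-256: the second pass hashes the digest of the first.
void sha256d( uint8_t out[32], const uint8_t *in, size_t len )
{
   uint8_t t[32];
   sha256_full( t, in, len );   // first SHA-256 pass
   sha256_full( out, t, 32 );   // second pass over the 32 byte digest
}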

SSE42 and AVX may now be displayed as mining features at startup.
This is hard coded for each algo, and is currently implemented only for
scrypt, as it is the only algo where those features make a significant
performance difference.
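
A minimal sketch of how such a startup display could work (illustrative
names only, not the actual gate code): each algo carries a hard coded
mask of the vector extensions that matter for it, and startup prints
only those the CPU also supports.

#include <stdio.h>

enum { FEAT_SSE2 = 1, FEAT_SSE42 = 2, FEAT_AVX = 4, FEAT_AVX2 = 8,
       FEAT_AVX512 = 16 };

// Hypothetical per-algo mask: scrypt is currently the only algo that
// would list SSE42 and AVX, since only there do they affect speed.
static const unsigned scrypt_features =
   FEAT_SSE2 | FEAT_SSE42 | FEAT_AVX | FEAT_AVX2 | FEAT_AVX512;

static void print_mining_features( unsigned algo_feats,
                                   unsigned cpu_feats )
{
   unsigned usable = algo_feats & cpu_feats;
   printf( "Mining features:" );
   if ( usable & FEAT_SSE2   ) printf( " SSE2" );
   if ( usable & FEAT_SSE42  ) printf( " SSE42" );
   if ( usable & FEAT_AVX    ) printf( " AVX" );
   if ( usable & FEAT_AVX2   ) printf( " AVX2" );
   if ( usable & FEAT_AVX512 ) printf( " AVX512" );
   printf( "\n" );
}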

Fixed an issue where a high hashrate algo could cause excessive invalid hash
rate log reports when starting up in benchmark mode.

v3.18.1

More speed for scrypt:
26 changes: 13 additions & 13 deletions algo/scrypt/scrypt-core-4way.c
@@ -337,42 +337,42 @@ do{ \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA1 ); \
XA1 = ROL_1X32( XA1 ); \
TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \
TA = ROL32( TA, 13 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \
TA = ROL32( TA, 13 ); \
XA3 = XOR( XA3, TA ); \
XC1 = ROL_1X32( XC1 ); \
TB = ROL32( TB, 13 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 13 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \
TA = ROL32( TA, 18 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, TA ); \
TB = ROL32( TB, 18 ); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
TC = ROL32( TC, 18 ); \
XC0 = XOR( XC0, TC ); \
\
TA = ADD32( XA0, XA1 ); \
XA3 = ROR_1X32( XA3 ); \
TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \
TA = ROL32( TA, 7 ); \
XA3 = ROR_1X32( XA3 ); \
XB3 = ROR_1X32( XB3 ); \
XA3 = XOR( XA3, TA ); \
TB = ROL32( TB, 7 ); \
XB3 = ROR_1X32( XB3 ); \
XC3 = ROR_1X32( XC3 ); \
XB3 = XOR( XB3, TB ); \
TC = ROL32( TC, 7 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA0 ); \
@@ -399,24 +399,24 @@ do{ \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA2 ); \
XA2 = SWAP_64( XA2 ); \
TB = ADD32( XB1, XB2 ); \
XB2 = SWAP_64( XB2 ); \
TA = ROL32( TA, 18); \
TC = ADD32( XC1, XC2 ); \
XA2 = SWAP_64( XA2 ); \
XC2 = SWAP_64( XC2 ); \
TB = ROL32( TB, 18); \
XA0 = XOR( XA0, TA ); \
XB2 = SWAP_64( XB2 ); \
XA1 = ROR_1X32( XA1 ); \
TC = ROL32( TC, 18); \
XB0 = XOR( XB0, TB ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB1 = ROR_1X32( XB1 ); \
XC0 = XOR( XC0, TC ); \
XC1 = ROR_1X32( XC1 ); \
} while (0);
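
For reference, the macro above is the standard Salsa20 add-rotate-xor
mixing step with rotation constants 7, 9, 13 and 18, interleaved across
three buffers (A, B, C) so independent instructions can overlap and hide
latency. A scalar sketch of one quarter-round, the pattern each
T = ADD32 / ROL32 / XOR triple implements (the ROL_1X32, SWAP_64 and
ROR_1X32 shuffles stand in for Salsa20's row rotations when the state is
held as four 128-bit vectors):

#include <stdint.h>

static inline uint32_t rol32( uint32_t x, int c )
{
   return ( x << c ) | ( x >> ( 32 - c ) );
}

// One Salsa20 quarter-round: the same chain the macro applies to whole
// 128-bit rows of three buffers at once.
static inline void salsa_quarter( uint32_t *a, uint32_t *b,
                                  uint32_t *c, uint32_t *d )
{
   *b ^= rol32( *a + *d,  7 );
   *c ^= rol32( *b + *a,  9 );
   *d ^= rol32( *c + *b, 13 );
   *a ^= rol32( *d + *c, 18 );
}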


// slow rol, an attempt to optimize non-avx512 bit rotations
// slow rot, an attempt to optimize non-avx512 bit rotations
// Contains target specific instructions, only for use with 128 bit vectors
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
do{ \
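
For context, the "slow rot" refers to rotate emulation: SSE2 through
AVX2 have no 32-bit vector rotate instruction, so ROL32 must be built
from two shifts and an OR, while AVX512VL provides a single-instruction
rotate (VPROLD). A hedged sketch of the two paths (standard Intel
intrinsics; the macro names are illustrative, not the ones used in this
file):

#include <immintrin.h>

// Generic SSE2 rotate-left: two shifts plus an OR, three instructions
// and an extra temporary register, hence "slow".
#define ROL32_SSE2( x, c ) \
   _mm_or_si128( _mm_slli_epi32( x, c ), _mm_srli_epi32( x, 32-(c) ) )

#if defined(__AVX512VL__)
// With AVX512VL the same rotate is one VPROLD instruction.
#define ROL32_AVX512( x, c )  _mm_rol_epi32( x, c )
#endif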
