
Commit

v23.6
JayDDee committed Oct 28, 2023
1 parent 160608c commit 46dca7a
Showing 20 changed files with 3,130 additions and 2,335 deletions.
7 changes: 7 additions & 0 deletions RELEASE_NOTES
@@ -73,6 +73,13 @@ If not what makes it happen or not happen?
Change Log
----------

v23.6

ARM: Sha256dt, Sha256t, Sha256d 4-way now working and fully optimized for NEON; SHA is also enabled but untested.
x86: Sha256dt, Sha256t, Sha256d faster SSE2 4-way.
ARM: Scrypt, Scryptn2 fully optimized for NEON; SHA is also enabled but untested.
Linux: added a log message when the miner is started as root, to discourage doing so.

v23.5

New version numbering drops the leading 3: the major version is now the calendar year, and the minor version identifies planned releases during the year.
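Regarding the new root warning noted above: such a check typically reduces to an effective-UID test at startup. A minimal sketch under that assumption (the function name and message text below are illustrative, not the miner's actual code):

#include <stdio.h>
#include <unistd.h>

/* Hypothetical sketch: warn when the process runs with effective UID 0.
   The message wording is an assumption, not cpuminer-opt's actual log text. */
static void warn_if_root( void )
{
   if ( geteuid() == 0 )
      fprintf( stderr, "Warning: running the miner as root is discouraged.\n" );
}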
4 changes: 2 additions & 2 deletions algo/argon2d/argon2d/opt.c
@@ -136,10 +136,10 @@ static void fill_block( __m256i *state, const block *ref_block,

#else // SSE2

static void fill_block( v128_t *state, const block *ref_block,
static void fill_block( v128u64_t *state, const block *ref_block,
block *next_block, int with_xor )
{
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
v128u64_t block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;

if ( with_xor )
82 changes: 36 additions & 46 deletions algo/argon2d/blake2/blamka-round-opt.h
@@ -23,56 +23,46 @@

#if !defined(__AVX512F__)


#if !defined(__AVX2__)


static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
const v128_t z = v128_mulw32(x, y);
return v128_add64(v128_add64(x, y), v128_add64(z, z));
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
{
const v128u64_t z = v128_mulw32( x, y );
return (v128u32_t)v128_add64( v128_add64( (v128u64_t)x, (v128u64_t)y ),
v128_add64( z, z ) );
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 32); \
D1 = v128_ror64(D1, 32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 24); \
B1 = v128_ror64(B1, 24); \
} while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 16); \
D1 = v128_ror64(D1, 16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 63); \
B1 = v128_ror64(B1, 63); \
} while ((void)0, 0)
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 32 ); \
D1 = v128_ror64( D1, 32 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 24 ); \
B1 = v128_ror64( B1, 24 ); \
}

#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 16 ); \
D1 = v128_ror64( D1, 16 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 63 ); \
B1 = v128_ror64( B1, 63 ); \
}

#if defined(__SSSE3__) || defined(__ARM_NEON)

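For reference, fBlaMka above is Argon2's BlaMka multiplication: per 64-bit lane it computes x + y + 2*lo32(x)*lo32(y), with v128_mulw32 supplying the widening low-32-bit multiply. A scalar sketch of one lane (the function name here is illustrative):

#include <stdint.h>

/* Scalar equivalent of one 64-bit lane of fBlaMka; the SIMD version above
   applies this to both lanes of a 128-bit vector at once. */
static inline uint64_t blamka_lane( uint64_t x, uint64_t y )
{
   const uint64_t z = (uint64_t)(uint32_t)x * (uint32_t)y;  // low 32 x low 32, widened to 64 bits
   return x + y + 2 * z;
}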
95 changes: 45 additions & 50 deletions algo/scrypt/scrypt-core-4way.c
@@ -2303,9 +2303,8 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2326,9 +2325,10 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[1] = v128_blendv( t1, t3, mask_3c );
XB[2] = v128_blendv( t2, t0, mask_f0 );
XB[3] = v128_blendv( t3, t1, mask_3c );
*/

#endif

/*
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2348,17 +2348,16 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[2] = YB2;
XA[3] = YA3;
XB[3] = YB3;

#endif
*/
}
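The SSE2/NEON path in these shuffle routines is built on v128_blendv with the full-width masks mask_cc, mask_f0 and mask_3c. Assuming v128_blendv follows the usual blendv convention (bits come from the second operand where the mask is set and from the first elsewhere), a scalar sketch of the select is:

#include <stdint.h>

/* Bitwise select, the scalar analogue of v128_blendv as used above:
   returns bits of b where mask is set, bits of a elsewhere. */
static inline uint64_t blendv64( uint64_t a, uint64_t b, uint64_t mask )
{
   return ( b & mask ) | ( a & ~mask );
}

With mask_cc = 0xffffffff00000000 per 64-bit lane, for example, the select keeps the low 32-bit word of a and takes the high word from b.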

static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{

v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;

#if defined(__SSE4_1__)
#if defined(__SSE4_1__)

v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
@@ -2377,9 +2376,8 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2389,19 +2387,21 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
XA[0] = v128_blendv( t0, t2, mask_cc );
XA[1] = v128_blendv( t1, t3, mask_cc );
XA[2] = v128_blendv( t2, t0, mask_cc );
XA[1] = v128_blendv( t2, t0, mask_cc );
XA[2] = v128_blendv( t1, t3, mask_cc );
XA[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XB[0], XB[2], mask_f0 );
t1 = v128_blendv( XB[1], XB[3], mask_3c );
t2 = v128_blendv( XB[2], XB[0], mask_f0 );
t1 = v128_blendv( XB[2], XB[0], mask_f0 );
t2 = v128_blendv( XB[1], XB[3], mask_3c );
t3 = v128_blendv( XB[3], XB[1], mask_3c );
XB[0] = v128_blendv( t0, t2, mask_cc );
XB[1] = v128_blendv( t1, t3, mask_cc );
XB[2] = v128_blendv( t2, t0, mask_cc );
XB[1] = v128_blendv( t2, t0, mask_cc );
XB[2] = v128_blendv( t1, t3, mask_cc );
XB[3] = v128_blendv( t3, t1, mask_cc );
*/

#endif

/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
@@ -2457,9 +2457,7 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;


#endif
*/
}

static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
@@ -2611,7 +2609,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;

#if defined(__SSE4_1__)
#if defined(__SSE4_1__)

v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
@@ -2638,9 +2636,8 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2650,28 +2647,29 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
v128_t t2 = v128_blendv( XA[2], XA[3], mask_cc );
v128_t t3 = v128_blendv( XA[3], XA[2], mask_cc );
XA[0] = v128_blendv( t0, t2, mask_f0 );
XA[1] = v128_blendv( t1, t3, mask_3c );
XA[2] = v128_blendv( t2, t0, mask_f0 );
XA[1] = v128_blendv( t2, t0, mask_f0 );
XA[2] = v128_blendv( t1, t3, mask_3c );
XA[3] = v128_blendv( t3, t1, mask_3c );
t0 = v128_blendv( XB[0], XB[1], mask_cc );
t1 = v128_blendv( XB[1], XB[0], mask_cc );
t2 = v128_blendv( XB[2], XB[3], mask_cc );
t3 = v128_blendv( XB[3], XB[2], mask_cc );
XB[0] = v128_blendv( t0, t2, mask_f0 );
XB[1] = v128_blendv( t1, t3, mask_3c );
XB[2] = v128_blendv( t2, t0, mask_f0 );
XB[1] = v128_blendv( t2, t0, mask_f0 );
XB[2] = v128_blendv( t1, t3, mask_3c );
XB[3] = v128_blendv( t3, t1, mask_3c );
t0 = v128_blendv( XC[0], XC[1], mask_cc );
t1 = v128_blendv( XC[1], XC[0], mask_cc );
t2 = v128_blendv( XC[2], XC[3], mask_cc );
t3 = v128_blendv( XC[3], XC[2], mask_cc );
XC[0] = v128_blendv( t0, t2, mask_f0 );
XC[1] = v128_blendv( t1, t3, mask_3c );
XC[2] = v128_blendv( t2, t0, mask_f0 );
XC[1] = v128_blendv( t2, t0, mask_f0 );
XC[2] = v128_blendv( t1, t3, mask_3c );
XC[3] = v128_blendv( t3, t1, mask_3c );
*/


#endif

/*
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2699,9 +2697,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
XA[3] = YA3;
XB[3] = YB3;
XC[3] = YC3;


#endif
*/
}

static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
@@ -2738,9 +2734,8 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2750,27 +2745,29 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
XA[0] = v128_blendv( t0, t2, mask_cc );
XA[1] = v128_blendv( t1, t3, mask_cc );
XA[2] = v128_blendv( t2, t0, mask_cc );
XA[1] = v128_blendv( t2, t0, mask_cc );
XA[2] = v128_blendv( t1, t3, mask_cc );
XA[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XB[0], XB[2], mask_f0 );
t1 = v128_blendv( XB[1], XB[3], mask_3c );
t2 = v128_blendv( XB[2], XB[0], mask_f0 );
t1 = v128_blendv( XB[2], XB[0], mask_f0 );
t2 = v128_blendv( XB[1], XB[3], mask_3c );
t3 = v128_blendv( XB[3], XB[1], mask_3c );
XB[0] = v128_blendv( t0, t2, mask_cc );
XB[1] = v128_blendv( t1, t3, mask_cc );
XB[2] = v128_blendv( t2, t0, mask_cc );
XB[1] = v128_blendv( t2, t0, mask_cc );
XB[2] = v128_blendv( t1, t3, mask_cc );
XB[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XC[0], XC[2], mask_f0 );
t1 = v128_blendv( XC[1], XC[3], mask_3c );
t2 = v128_blendv( XC[2], XC[0], mask_f0 );
t1 = v128_blendv( XC[2], XC[0], mask_f0 );
t2 = v128_blendv( XC[1], XC[3], mask_3c );
t3 = v128_blendv( XC[3], XC[1], mask_3c );
XC[0] = v128_blendv( t0, t2, mask_cc );
XC[1] = v128_blendv( t1, t3, mask_cc );
XC[2] = v128_blendv( t2, t0, mask_cc );
XC[1] = v128_blendv( t2, t0, mask_cc );
XC[2] = v128_blendv( t1, t3, mask_cc );
XC[3] = v128_blendv( t3, t1, mask_cc );
*/


#endif

/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
@@ -2850,9 +2847,7 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;


#endif
*/
}

// Triple buffered, 3x memory usage
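For orientation, the shuffle/unshuffle pairs above convert the 16-word Salsa20 state between linear order and a diagonal order, so the kernel can apply quarter-rounds to whole vectors. A scalar sketch of the forward shuffle follows: row 0 matches the commented reference code above (xa[0], xa[5], xa[10], xa[15]), while the indices for rows 1 to 3 are assumed from the conventional Salsa20 SIMD layout rather than taken from this diff.

#include <stdint.h>

/* Illustrative scalar shuffle: each output row gathers one word from each
   column of the 4x4 Salsa20 state along a diagonal. Row 0 is taken from
   the commented reference code above; rows 1-3 are assumed. */
static const int salsa_shuffle_idx[4][4] = {
   {  0,  5, 10, 15 },
   {  4,  9, 14,  3 },
   {  8, 13,  2,  7 },
   { 12,  1,  6, 11 }
};

static void salsa_shuffle_scalar( uint32_t dst[16], const uint32_t src[16] )
{
   for ( int row = 0; row < 4; row++ )
      for ( int lane = 0; lane < 4; lane++ )
         dst[ 4*row + lane ] = src[ salsa_shuffle_idx[row][lane] ];
}

The unshuffle applies the inverse permutation; doing the reordering with a blend network, as the vector code above does, avoids scalar loads and stores.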
The diffs for the remaining changed files are not rendered on this page.
