
Commit

v23.6
JayDDee committed Oct 28, 2023
1 parent 160608c commit 46dca7a
Showing 20 changed files with 3,130 additions and 2,335 deletions.
7 changes: 7 additions & 0 deletions RELEASE_NOTES
@@ -73,6 +73,13 @@ If not what makes it happen or not happen?
Change Log
----------

v23.6

ARM: Sha256dt, Sha256t, Sha256d 4-way now working and fully optimized for NEON; SHA is also enabled but untested.
x86: Sha256dt, Sha256t, Sha256d faster SSE2 4-way.
ARM: Scrypt, Scryptn2 fully optimized for NEON; SHA is also enabled but untested.
Linux: added a log message when the miner is started as root, to discourage doing so.

v23.5

New version numbering drops the leading 3: the major version is now the calendar year, and the minor version identifies planned releases during the year.
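Regarding the new root warning noted above: such a check typically reduces to an effective-UID test at startup. A minimal sketch under that assumption (the function name and message text below are illustrative, not the miner's actual code):

#include <stdio.h>
#include <unistd.h>

/* Hypothetical sketch: warn when the process runs with effective UID 0.
   The message wording is an assumption, not cpuminer-opt's actual log text. */
static void warn_if_root( void )
{
   if ( geteuid() == 0 )
      fprintf( stderr, "Warning: running the miner as root is discouraged.\n" );
}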
4 changes: 2 additions & 2 deletions algo/argon2d/argon2d/opt.c
@@ -136,10 +136,10 @@ static void fill_block( __m256i *state, const block *ref_block,

#else // SSE2

static void fill_block( v128_t *state, const block *ref_block,
static void fill_block( v128u64_t *state, const block *ref_block,
block *next_block, int with_xor )
{
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
v128u64_t block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;

if ( with_xor )
82 changes: 36 additions & 46 deletions algo/argon2d/blake2/blamka-round-opt.h
@@ -23,56 +23,46 @@

#if !defined(__AVX512F__)


#if !defined(__AVX2__)


static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
const v128_t z = v128_mulw32(x, y);
return v128_add64(v128_add64(x, y), v128_add64(z, z));
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
{
const v128u64_t z = v128_mulw32( x, y );
return (v128u32_t)v128_add64( v128_add64( (v128u64_t)x, (v128u64_t)y ),
v128_add64( z, z ) );
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 32); \
D1 = v128_ror64(D1, 32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 24); \
B1 = v128_ror64(B1, 24); \
} while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 16); \
D1 = v128_ror64(D1, 16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 63); \
B1 = v128_ror64(B1, 63); \
} while ((void)0, 0)
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 32 ); \
D1 = v128_ror64( D1, 32 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 24 ); \
B1 = v128_ror64( B1, 24 ); \
}

#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 16 ); \
D1 = v128_ror64( D1, 16 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 63 ); \
B1 = v128_ror64( B1, 63 ); \
}

#if defined(__SSSE3__) || defined(__ARM_NEON)

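For reference, fBlaMka above is Argon2's BlaMka multiplication: per 64-bit lane it computes x + y + 2*lo32(x)*lo32(y), with v128_mulw32 supplying the widening low-32-bit multiply. A scalar sketch of one lane (the function name here is illustrative):

#include <stdint.h>

/* Scalar equivalent of one 64-bit lane of fBlaMka; the SIMD version above
   applies this to both lanes of a 128-bit vector at once. */
static inline uint64_t blamka_lane( uint64_t x, uint64_t y )
{
   const uint64_t z = (uint64_t)(uint32_t)x * (uint32_t)y;  // low 32 x low 32, widened to 64 bits
   return x + y + 2 * z;
}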
95 changes: 45 additions & 50 deletions algo/scrypt/scrypt-core-4way.c
@@ -2303,9 +2303,8 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2326,9 +2325,10 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[1] = v128_blendv( t1, t3, mask_3c );
XB[2] = v128_blendv( t2, t0, mask_f0 );
XB[3] = v128_blendv( t3, t1, mask_3c );
*/

#endif

/*
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2348,17 +2348,16 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[2] = YB2;
XA[3] = YA3;
XB[3] = YB3;

#endif
*/
}
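The SSE2/NEON path in these shuffle routines is built on v128_blendv with the full-width masks mask_cc, mask_f0 and mask_3c. Assuming v128_blendv follows the usual blendv convention (bits come from the second operand where the mask is set and from the first elsewhere), a scalar sketch of the select is:

#include <stdint.h>

/* Bitwise select, the scalar analogue of v128_blendv as used above:
   returns bits of b where mask is set, bits of a elsewhere. */
static inline uint64_t blendv64( uint64_t a, uint64_t b, uint64_t mask )
{
   return ( b & mask ) | ( a & ~mask );
}

With mask_cc = 0xffffffff00000000 per 64-bit lane, for example, the select keeps the low 32-bit word of a and takes the high word from b.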

static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{

v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;

#if defined(__SSE4_1__)
#if defined(__SSE4_1__)

v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
@@ -2377,9 +2376,8 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2389,19 +2387,21 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
XA[0] = v128_blendv( t0, t2, mask_cc );
XA[1] = v128_blendv( t1, t3, mask_cc );
XA[2] = v128_blendv( t2, t0, mask_cc );
XA[1] = v128_blendv( t2, t0, mask_cc );
XA[2] = v128_blendv( t1, t3, mask_cc );
XA[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XB[0], XB[2], mask_f0 );
t1 = v128_blendv( XB[1], XB[3], mask_3c );
t2 = v128_blendv( XB[2], XB[0], mask_f0 );
t1 = v128_blendv( XB[2], XB[0], mask_f0 );
t2 = v128_blendv( XB[1], XB[3], mask_3c );
t3 = v128_blendv( XB[3], XB[1], mask_3c );
XB[0] = v128_blendv( t0, t2, mask_cc );
XB[1] = v128_blendv( t1, t3, mask_cc );
XB[2] = v128_blendv( t2, t0, mask_cc );
XB[1] = v128_blendv( t2, t0, mask_cc );
XB[2] = v128_blendv( t1, t3, mask_cc );
XB[3] = v128_blendv( t3, t1, mask_cc );
*/

#endif

/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
@@ -2457,9 +2457,7 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;


#endif
*/
}

static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
@@ -2611,7 +2609,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;

#if defined(__SSE4_1__)
#if defined(__SSE4_1__)

v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
@@ -2638,9 +2636,8 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2650,28 +2647,29 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
v128_t t2 = v128_blendv( XA[2], XA[3], mask_cc );
v128_t t3 = v128_blendv( XA[3], XA[2], mask_cc );
XA[0] = v128_blendv( t0, t2, mask_f0 );
XA[1] = v128_blendv( t1, t3, mask_3c );
XA[2] = v128_blendv( t2, t0, mask_f0 );
XA[1] = v128_blendv( t2, t0, mask_f0 );
XA[2] = v128_blendv( t1, t3, mask_3c );
XA[3] = v128_blendv( t3, t1, mask_3c );
t0 = v128_blendv( XB[0], XB[1], mask_cc );
t1 = v128_blendv( XB[1], XB[0], mask_cc );
t2 = v128_blendv( XB[2], XB[3], mask_cc );
t3 = v128_blendv( XB[3], XB[2], mask_cc );
XB[0] = v128_blendv( t0, t2, mask_f0 );
XB[1] = v128_blendv( t1, t3, mask_3c );
XB[2] = v128_blendv( t2, t0, mask_f0 );
XB[1] = v128_blendv( t2, t0, mask_f0 );
XB[2] = v128_blendv( t1, t3, mask_3c );
XB[3] = v128_blendv( t3, t1, mask_3c );
t0 = v128_blendv( XC[0], XC[1], mask_cc );
t1 = v128_blendv( XC[1], XC[0], mask_cc );
t2 = v128_blendv( XC[2], XC[3], mask_cc );
t3 = v128_blendv( XC[3], XC[2], mask_cc );
XC[0] = v128_blendv( t0, t2, mask_f0 );
XC[1] = v128_blendv( t1, t3, mask_3c );
XC[2] = v128_blendv( t2, t0, mask_f0 );
XC[1] = v128_blendv( t2, t0, mask_f0 );
XC[2] = v128_blendv( t1, t3, mask_3c );
XC[3] = v128_blendv( t3, t1, mask_3c );
*/


#endif

/*
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2699,9 +2697,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
XA[3] = YA3;
XB[3] = YB3;
XC[3] = YC3;


#endif
*/
}

static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
@@ -2738,9 +2734,8 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );

#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON

/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2750,27 +2745,29 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
XA[0] = v128_blendv( t0, t2, mask_cc );
XA[1] = v128_blendv( t1, t3, mask_cc );
XA[2] = v128_blendv( t2, t0, mask_cc );
XA[1] = v128_blendv( t2, t0, mask_cc );
XA[2] = v128_blendv( t1, t3, mask_cc );
XA[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XB[0], XB[2], mask_f0 );
t1 = v128_blendv( XB[1], XB[3], mask_3c );
t2 = v128_blendv( XB[2], XB[0], mask_f0 );
t1 = v128_blendv( XB[2], XB[0], mask_f0 );
t2 = v128_blendv( XB[1], XB[3], mask_3c );
t3 = v128_blendv( XB[3], XB[1], mask_3c );
XB[0] = v128_blendv( t0, t2, mask_cc );
XB[1] = v128_blendv( t1, t3, mask_cc );
XB[2] = v128_blendv( t2, t0, mask_cc );
XB[1] = v128_blendv( t2, t0, mask_cc );
XB[2] = v128_blendv( t1, t3, mask_cc );
XB[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XC[0], XC[2], mask_f0 );
t1 = v128_blendv( XC[1], XC[3], mask_3c );
t2 = v128_blendv( XC[2], XC[0], mask_f0 );
t1 = v128_blendv( XC[2], XC[0], mask_f0 );
t2 = v128_blendv( XC[1], XC[3], mask_3c );
t3 = v128_blendv( XC[3], XC[1], mask_3c );
XC[0] = v128_blendv( t0, t2, mask_cc );
XC[1] = v128_blendv( t1, t3, mask_cc );
XC[2] = v128_blendv( t2, t0, mask_cc );
XC[1] = v128_blendv( t2, t0, mask_cc );
XC[2] = v128_blendv( t1, t3, mask_cc );
XC[3] = v128_blendv( t3, t1, mask_cc );
*/


#endif

/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
@@ -2850,9 +2847,7 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;


#endif
*/
}

// Triple buffered, 3x memory usage
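For orientation, the shuffle/unshuffle pairs above convert the 16-word Salsa20 state between linear order and a diagonal order, so the kernel can apply quarter-rounds to whole vectors. A scalar sketch of the forward shuffle follows: row 0 matches the commented reference code above (xa[0], xa[5], xa[10], xa[15]), while the indices for rows 1 to 3 are assumed from the conventional Salsa20 SIMD layout rather than taken from this diff.

#include <stdint.h>

/* Illustrative scalar shuffle: each output row gathers one word from each
   column of the 4x4 Salsa20 state along a diagonal. Row 0 is taken from
   the commented reference code above; rows 1-3 are assumed. */
static const int salsa_shuffle_idx[4][4] = {
   {  0,  5, 10, 15 },
   {  4,  9, 14,  3 },
   {  8, 13,  2,  7 },
   { 12,  1,  6, 11 }
};

static void salsa_shuffle_scalar( uint32_t dst[16], const uint32_t src[16] )
{
   for ( int row = 0; row < 4; row++ )
      for ( int lane = 0; lane < 4; lane++ )
         dst[ 4*row + lane ] = src[ salsa_shuffle_idx[row][lane] ];
}

The unshuffle applies the inverse permutation; doing the reordering with a blend network, as the vector code above does, avoids scalar loads and stores.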
The diffs for the remaining changed files are not rendered on this page.
