Skip to content

Commit

Permalink
v3.21.3 Unreleased
Browse files Browse the repository at this point in the history
  • Loading branch information
JayDDee committed Mar 13, 2023
1 parent b339450 commit c6bc9d6
Show file tree
Hide file tree
Showing 49 changed files with 1,130 additions and 1,115 deletions.
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ cpuminer_SOURCES = \
algo/x16/x16r-4way.c \
algo/x16/x16rv2.c \
algo/x16/x16rv2-4way.c \
algo/x16/x16rt.c \
algo/x16/x16rt-4way.c \
algo/x16/hex.c \
algo/x16/x21s-4way.c \
algo/x16/x21s.c \
Expand Down
10 changes: 7 additions & 3 deletions RELEASE_NOTES
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ If not what makes it happen or not happen?
Change Log
----------

v3.22.3
v3.21.3.1 UNRELEASED

Revert to 3.21.2

v3.21.3 CANCELLED

#392 #379 #389 Fixed misaligned address segfault when solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
Expand All @@ -74,10 +78,10 @@ v3.22.3
Windows binaries no longer support CPU groups.
Windows binaries support CPUs with up to 64 threads.
Midstate prehash is now centralized, done only once instead of by every thread
for selected algos.
for selected algos.
Small optimizations to serialized vectoring.

v3.22.2
v3.21.2

Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
Fixed a couple of compiler warnings with gcc-12.
Expand Down
1 change: 0 additions & 1 deletion algo-gate-api.c
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->miner_thread_init = (void*)&return_true;
gate->scanhash = (void*)&scanhash_generic;
gate->hash = (void*)&null_hash;
gate->prehash = (void*)&return_true;
gate->get_new_work = (void*)&std_get_new_work;
gate->work_decode = (void*)&std_le_work_decode;
gate->decode_extra_data = (void*)&do_nothing;
Expand Down
5 changes: 1 addition & 4 deletions algo-gate-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,17 +119,14 @@ typedef struct
// to be registered with the gate.
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

int ( *hash ) ( void*, const void*, const int );
int ( *hash ) ( void*, const void*, int );

//optional, safe to use default in most cases

// Called once by each miner thread to allocate thread local buffers and
// other initialization specific to miner threads.
bool ( *miner_thread_init ) ( int );

// Perform prehash after receiving new work
int ( *prehash ) ( struct work* );

// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );

Expand Down
155 changes: 2 additions & 153 deletions algo/blake/blake2s-4way.c
Original file line number Diff line number Diff line change
@@ -1,50 +1,12 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
//#include "sph-blake2s.h"
#include <string.h>
#include <stdint.h>

#if defined(BLAKE2S_16WAY)

static __thread blake2s_16way_state blake2s_16w_ctx;

/*
static blake2s_16way_state blake2s_16w_ctx;
static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
*/
/*
int blake2s_16way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
return 1;
}
*/
/*
int blake2s_16way_prehash( struct work *work )
{
mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
return 1;
}
*/

void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
Expand All @@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;

/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
*/

mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );


do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
Expand Down Expand Up @@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,

static __thread blake2s_8way_state blake2s_8w_ctx;

/*
static blake2s_8way_state blake2s_8w_ctx;
static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
int blake2s_8way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
}
*/

void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
Expand All @@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;

/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
*/

mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );


do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
Expand Down Expand Up @@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
#elif defined(BLAKE2S_4WAY)

static __thread blake2s_4way_state blake2s_4w_ctx;
/*
static blake2s_4way_state blake2s_4w_ctx;
static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));
int blake2s_4way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );

blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
blake2s_4w_ctx.t[0] = 64;
}
*/
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
Expand All @@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
*/

mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
Expand Down
4 changes: 1 addition & 3 deletions algo/blake/blake2s-gate.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
// gate->prehash = (void*)&blake2s_16way_prehash;
#elif defined(BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
// gate->prehash = (void*)&blake2s_8way_prehash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
// gate->prehash = (void*)&blake2s_4way_prehash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
Expand Down
4 changes: 0 additions & 4 deletions algo/blake/blake2s-gate.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_16way_prehash( struct work * );

#elif defined (BLAKE2S_8WAY)

void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_8way_prehash( struct work * );

#elif defined (BLAKE2S_4WAY)

void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_4way_prehash( struct work * );

#else

void blake2s_hash( void *state, const void *input );
Expand Down
18 changes: 9 additions & 9 deletions algo/blake/blake2s-hash-4way.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )

#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
Expand All @@ -120,7 +120,7 @@ do { \

#define ROUND4W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
Expand Down Expand Up @@ -317,8 +317,8 @@ do { \

#define G8W( sigma0, sigma1, a, b, c, d) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
Expand All @@ -331,7 +331,7 @@ do { \

#define ROUND8W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
Expand Down Expand Up @@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )

#define G16W( sigma0, sigma1, a, b, c, d) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
Expand All @@ -543,7 +543,7 @@ do { \

#define ROUND16W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
Expand Down
Loading

0 comments on commit c6bc9d6

Please sign in to comment.