Skip to content

Commit

Permalink
1.1.6 changes.
Browse files Browse the repository at this point in the history
Refactor Cryptonight code. Use macros for similar parts in 1way and 2way methods.
Fix scratchpad fill and read prefetch. Further optimizations.
Benchmark should give more real world performance from data gathered from 16 days of mining.
Fix memory allocation for Large Pages.
Add Tuning functionality for AVX and non-AES code. Switch from 1way to 2way.
Disable MSR on non-AES binaries.
Add p2pool as dev fee collection alternative. Should be usable for China and Hong Kong region users.
Make tune functionality as default behaviour. --no-tune to disable it.
Prevent sending very old/stale shares after losing connection.
Remove --cn-config functionality.
  • Loading branch information
michal-zurkowski committed Jun 8, 2021
1 parent e622cf1 commit 2d85ba6
Show file tree
Hide file tree
Showing 14 changed files with 480 additions and 419 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Supported Algorithms
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
gr Ghost Rider (RTM)
groestl Groestl coin
hex x16r-hex
hmq1725 Espers
Expand Down
35 changes: 20 additions & 15 deletions algo/gr/cryptonote/cryptonight.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ static void do_skein_hash(const void *input, size_t len, void *output) {
static void (*const extra_hashes[4])(const void *, size_t, void *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};

static __attribute__((always_inline)) uint64_t
__umul128(const uint64_t *a, const uint64_t *b, uint64_t *hi) {
unsigned __int128 r = (unsigned __int128)(*a) * (unsigned __int128)(*b);
*hi = r >> 64;
return (uint64_t)r;
}

// This will shift and xor tmp1 into itself as 4 32-bit vals such as
// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
static inline __m128i sl_xor(__m128i tmp1) {
Expand Down Expand Up @@ -185,7 +178,7 @@ static inline void explode_scratchpad(const __m128i *state, __m128i *ls,
__m128i k[10];

aes_genkey(state, k);
const __m128i *restrict key = __builtin_assume_aligned(k, 16);
const __m128i *key = __builtin_assume_aligned(k, 16);

memcpy(x, state + 4, 128);

Expand Down Expand Up @@ -217,7 +210,7 @@ static inline void implode_scratchpad(const __m128i *ls, __m128i *state,
__m128i k[10];

aes_genkey(state + 2, k);
const __m128i *restrict key = __builtin_assume_aligned(k, 16);
const __m128i *key = __builtin_assume_aligned(k, 16);

memcpy(x, state + 4, 128);

Expand All @@ -229,17 +222,29 @@ static inline void implode_scratchpad(const __m128i *ls, __m128i *state,
for (i = 0; i < memory - PREFETCH_SIZE_B; i += 128) {
_mm_prefetch(ls + PREFETCH_SHIFT, PREFETCH_TYPE_R);
_mm_prefetch(ls + PREFETCH_SHIFT + WPL, PREFETCH_TYPE_R);
for (size_t j = 0; j < 8; ++j) {
x[j] = _mm_xor_si128(_mm_load_si128(ls + j), x[j]);
}
x[0] = _mm_xor_si128(ls[0], x[0]);
x[1] = _mm_xor_si128(ls[1], x[1]);
x[2] = _mm_xor_si128(ls[2], x[2]);
x[3] = _mm_xor_si128(ls[3], x[3]);
x[4] = _mm_xor_si128(ls[4], x[4]);
x[5] = _mm_xor_si128(ls[5], x[5]);
x[6] = _mm_xor_si128(ls[6], x[6]);
x[7] = _mm_xor_si128(ls[7], x[7]);
ls += WPS;

aes_batch(key, x);
}

for (; i < memory; i += 128) {
for (size_t j = 0; j < 8; ++j) {
x[j] = _mm_xor_si128(_mm_load_si128(ls + j), x[j]);
}
x[0] = _mm_xor_si128(ls[0], x[0]);
x[1] = _mm_xor_si128(ls[1], x[1]);
x[2] = _mm_xor_si128(ls[2], x[2]);
x[3] = _mm_xor_si128(ls[3], x[3]);
x[4] = _mm_xor_si128(ls[4], x[4]);
x[5] = _mm_xor_si128(ls[5], x[5]);
x[6] = _mm_xor_si128(ls[6], x[6]);
x[7] = _mm_xor_si128(ls[7], x[7]);
ls += WPS;

aes_batch(key, x);
}
Expand Down
2 changes: 1 addition & 1 deletion algo/gr/cryptonote/soft_aes.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ const uint32_t saes_table[4][256] = {saes_data(saes_u0), saes_data(saes_u1),
saes_data(saes_u2), saes_data(saes_u3)};
__attribute__((aligned(16))) const uint8_t saes_sbox[256] = saes_data(saes_h0);

static __attribute__((always_inline)) uint32_t sub_word(uint32_t key) {
static __attribute__((always_inline)) inline uint32_t sub_word(uint32_t key) {
return (saes_sbox[key >> 24] << 24) | (saes_sbox[(key >> 16) & 0xff] << 16) |
(saes_sbox[(key >> 8) & 0xff] << 8) | saes_sbox[key & 0xff];
}
Expand Down
117 changes: 27 additions & 90 deletions algo/gr/gr-4way.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,22 @@

#if defined(GR_4WAY)

#define CRYPTONIGHT_HASH(variant, way) \
if (vectorized) { \
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash); \
} \
\
if (way) { \
cryptonight_##variant##_2way_hash(hash0, hash1, hash0, hash1); \
cryptonight_##variant##_2way_hash(hash2, hash3, hash2, hash3); \
} else { \
cryptonight_##variant##_hash(hash0, hash0); \
cryptonight_##variant##_hash(hash1, hash1); \
cryptonight_##variant##_hash(hash2, hash2); \
cryptonight_##variant##_hash(hash3, hash3); \
} \
vectorized = false;

int gr_4way_hash(void *output, const void *input, int thrid) {
uint64_t vhash[10 * 4] __attribute__((aligned(128)));
uint64_t vhashA[10 * 2] __attribute__((aligned(128)));
Expand Down Expand Up @@ -267,95 +283,22 @@ int gr_4way_hash(void *output, const void *input, int thrid) {
vectorized = true;
break;
case CNTurtlelite:
if (vectorized) {
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
}
if (cn_config[Turtlelite]) {
cryptonight_turtlelite_2way_hash(hash0, hash1, hash0, hash1);
cryptonight_turtlelite_2way_hash(hash2, hash3, hash2, hash3);
} else {
cryptonight_turtlelite_hash(hash0, hash0);
cryptonight_turtlelite_hash(hash1, hash1);
cryptonight_turtlelite_hash(hash2, hash2);
cryptonight_turtlelite_hash(hash3, hash3);
}
vectorized = false;
CRYPTONIGHT_HASH(turtlelite, cn_config[Turtlelite]);
break;
case CNTurtle:
if (vectorized) {
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
}
if (cn_config[Turtle]) {
cryptonight_turtle_2way_hash(hash0, hash1, hash0, hash1);
cryptonight_turtle_2way_hash(hash2, hash3, hash2, hash3);
} else {
cryptonight_turtle_hash(hash0, hash0);
cryptonight_turtle_hash(hash1, hash1);
cryptonight_turtle_hash(hash2, hash2);
cryptonight_turtle_hash(hash3, hash3);
}
vectorized = false;
CRYPTONIGHT_HASH(turtle, cn_config[Turtle]);
break;
case CNDarklite:
if (vectorized) {
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
}

if (cn_config[Darklite]) {
cryptonight_darklite_2way_hash(hash0, hash1, hash0, hash1);
cryptonight_darklite_2way_hash(hash2, hash3, hash2, hash3);
} else {
cryptonight_darklite_hash(hash0, hash0);
cryptonight_darklite_hash(hash1, hash1);
cryptonight_darklite_hash(hash2, hash2);
cryptonight_darklite_hash(hash3, hash3);
}
vectorized = false;
CRYPTONIGHT_HASH(darklite, cn_config[Darklite]);
break;
case CNDark:
if (vectorized) {
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
}
if (cn_config[Dark]) {
cryptonight_dark_2way_hash(hash0, hash1, hash0, hash1);
cryptonight_dark_2way_hash(hash2, hash3, hash2, hash3);
} else {
cryptonight_dark_hash(hash0, hash0);
cryptonight_dark_hash(hash1, hash1);
cryptonight_dark_hash(hash2, hash2);
cryptonight_dark_hash(hash3, hash3);
}
vectorized = false;
CRYPTONIGHT_HASH(dark, cn_config[Dark]);
break;
case CNLite:
if (vectorized) {
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
}
if (cn_config[Lite]) {
cryptonight_lite_2way_hash(hash0, hash1, hash0, hash1);
cryptonight_lite_2way_hash(hash2, hash3, hash2, hash3);
} else {
cryptonight_lite_hash(hash0, hash0);
cryptonight_lite_hash(hash1, hash1);
cryptonight_lite_hash(hash2, hash2);
cryptonight_lite_hash(hash3, hash3);
}
vectorized = false;
CRYPTONIGHT_HASH(lite, cn_config[Lite]);
break;
case CNFast:
if (vectorized) {
dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
}
if (cn_config[Fast]) {
cryptonight_fast_2way_hash(hash0, hash1, hash0, hash1);
cryptonight_fast_2way_hash(hash2, hash3, hash2, hash3);
} else {
cryptonight_fast_hash(hash0, hash0);
cryptonight_fast_hash(hash1, hash1);
cryptonight_fast_hash(hash2, hash2);
cryptonight_fast_hash(hash3, hash3);
}
vectorized = false;
CRYPTONIGHT_HASH(fast, cn_config[Fast]);
break;
}

Expand Down Expand Up @@ -396,23 +339,17 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i *)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart);

if (opt_tune) {
if (!opt_tuned && opt_tune) {
tune(pdata, thr_id);
opt_tuned = true; // Tuned.
opt_tune = false; // Tune only once.
// Prevent error messages after tuning with --benchmark.
if (opt_benchmark) {
exit(0);
}
}

if (opt_benchmark_config) {
benchmark_configs(pdata, thr_id);
}

if (opt_benchmark) {
if (thr_id == 0) {
applog(LOG_BLUE, "Starting benchmark. Benchmark takes 300s to complete");
}
benchmark(pdata, thr_id, 0);
diff_to_hash(ptarget, 0.05 / 65536.0);
exit(0);
}

mm256_bswap32_intrlv80_4x64(vdata, pdata);
Expand Down
Loading

0 comments on commit 2d85ba6

Please sign in to comment.