From 2d85ba6e4be503cbc98e598a18becdf4a9626869 Mon Sep 17 00:00:00 2001
From: Michal Zurkowski <michal.zurkowski@gmail.com>
Date: Tue, 8 Jun 2021 02:10:59 +0200
Subject: [PATCH] 1.1.6 changes.

Refactor Cryptonight code. Use macros for similar parts in 1way and 2way methods.
Fix scratchpad fill and read prefetch. Further optimizations.
Benchmark should give more real world performance from data gathered from 16 days of mining.
Fix memory allocation for Large Pages.
Add Tuning functionality for AVX and non-AES code. Switch from 1way to 2way.
Disable MSR on non-AES binaries.
Add p2pool as dev fee collection alternative. Should be usable for China and Hong Kong region users.
Make tune functionality as default behaviour. --no-tune to disable it.
Prevent sending very old/stale shares after losing connection.
Remove --cn-config functionality.
---
 README.md                        |   1 +
 algo/gr/cryptonote/cryptonight.c |  35 ++--
 algo/gr/cryptonote/soft_aes.h    |   2 +-
 algo/gr/gr-4way.c                | 117 ++++----------
 algo/gr/gr-gate.c                | 215 +++++++++++--------------
 algo/gr/gr-gate.h                |  12 +-
 algo/gr/gr.c                     | 264 +++++++++++++++++++++++--------
 build-allarch.sh                 |   1 +
 configure                        |  20 +--
 configure.ac                     |   2 +-
 cpu-miner.c                      | 178 +++++++++++----------
 miner.h                          |  27 ++--
 virtual_memory.c                 |  24 ++-
 winbuild-cross.sh                |   1 +
 14 files changed, 480 insertions(+), 419 deletions(-)

diff --git a/README.md b/README.md
index d740fd7..1507633 100755
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Supported Algorithms
                           decred
                           deep          Deepcoin (DCN)
                           dmd-gr        Diamond-Groestl
+                          gr            Ghost Rider (RTM)
                           groestl       Groestl coin
                           hex           x16r-hex
                           hmq1725       Espers
diff --git a/algo/gr/cryptonote/cryptonight.c b/algo/gr/cryptonote/cryptonight.c
index ef7c2e9..d6104fb 100755
--- a/algo/gr/cryptonote/cryptonight.c
+++ b/algo/gr/cryptonote/cryptonight.c
@@ -41,13 +41,6 @@ static void do_skein_hash(const void *input, size_t len, void *output) {
 static void (*const extra_hashes[4])(const void *, size_t, void *) = {
     do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
 
-static __attribute__((always_inline)) uint64_t
-__umul128(const uint64_t *a, const uint64_t *b, uint64_t *hi) {
-  unsigned __int128 r = (unsigned __int128)(*a) * (unsigned __int128)(*b);
-  *hi = r >> 64;
-  return (uint64_t)r;
-}
-
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
 // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
 static inline __m128i sl_xor(__m128i tmp1) {
@@ -185,7 +178,7 @@ static inline void explode_scratchpad(const __m128i *state, __m128i *ls,
   __m128i k[10];
 
   aes_genkey(state, k);
-  const __m128i *restrict key = __builtin_assume_aligned(k, 16);
+  const __m128i *key = __builtin_assume_aligned(k, 16);
 
   memcpy(x, state + 4, 128);
 
@@ -217,7 +210,7 @@ static inline void implode_scratchpad(const __m128i *ls, __m128i *state,
   __m128i k[10];
 
   aes_genkey(state + 2, k);
-  const __m128i *restrict key = __builtin_assume_aligned(k, 16);
+  const __m128i *key = __builtin_assume_aligned(k, 16);
 
   memcpy(x, state + 4, 128);
 
@@ -229,17 +222,29 @@ static inline void implode_scratchpad(const __m128i *ls, __m128i *state,
   for (i = 0; i < memory - PREFETCH_SIZE_B; i += 128) {
     _mm_prefetch(ls + PREFETCH_SHIFT, PREFETCH_TYPE_R);
     _mm_prefetch(ls + PREFETCH_SHIFT + WPL, PREFETCH_TYPE_R);
-    for (size_t j = 0; j < 8; ++j) {
-      x[j] = _mm_xor_si128(_mm_load_si128(ls + j), x[j]);
-    }
+    x[0] = _mm_xor_si128(ls[0], x[0]);
+    x[1] = _mm_xor_si128(ls[1], x[1]);
+    x[2] = _mm_xor_si128(ls[2], x[2]);
+    x[3] = _mm_xor_si128(ls[3], x[3]);
+    x[4] = _mm_xor_si128(ls[4], x[4]);
+    x[5] = _mm_xor_si128(ls[5], x[5]);
+    x[6] = _mm_xor_si128(ls[6], x[6]);
+    x[7] = _mm_xor_si128(ls[7], x[7]);
+    ls += WPS;
 
     aes_batch(key, x);
   }
 
   for (; i < memory; i += 128) {
-    for (size_t j = 0; j < 8; ++j) {
-      x[j] = _mm_xor_si128(_mm_load_si128(ls + j), x[j]);
-    }
+    x[0] = _mm_xor_si128(ls[0], x[0]);
+    x[1] = _mm_xor_si128(ls[1], x[1]);
+    x[2] = _mm_xor_si128(ls[2], x[2]);
+    x[3] = _mm_xor_si128(ls[3], x[3]);
+    x[4] = _mm_xor_si128(ls[4], x[4]);
+    x[5] = _mm_xor_si128(ls[5], x[5]);
+    x[6] = _mm_xor_si128(ls[6], x[6]);
+    x[7] = _mm_xor_si128(ls[7], x[7]);
+    ls += WPS;
 
     aes_batch(key, x);
   }
diff --git a/algo/gr/cryptonote/soft_aes.h b/algo/gr/cryptonote/soft_aes.h
index a2ab72c..155d764 100755
--- a/algo/gr/cryptonote/soft_aes.h
+++ b/algo/gr/cryptonote/soft_aes.h
@@ -95,7 +95,7 @@ const uint32_t saes_table[4][256] = {saes_data(saes_u0), saes_data(saes_u1),
                                      saes_data(saes_u2), saes_data(saes_u3)};
 __attribute__((aligned(16))) const uint8_t saes_sbox[256] = saes_data(saes_h0);
 
-static __attribute__((always_inline)) uint32_t sub_word(uint32_t key) {
+static __attribute__((always_inline)) inline uint32_t sub_word(uint32_t key) {
   return (saes_sbox[key >> 24] << 24) | (saes_sbox[(key >> 16) & 0xff] << 16) |
          (saes_sbox[(key >> 8) & 0xff] << 8) | saes_sbox[key & 0xff];
 }
diff --git a/algo/gr/gr-4way.c b/algo/gr/gr-4way.c
index 1ee2e46..43f176b 100755
--- a/algo/gr/gr-4way.c
+++ b/algo/gr/gr-4way.c
@@ -2,6 +2,22 @@
 
 #if defined(GR_4WAY)
 
+#define CRYPTONIGHT_HASH(variant, way)                                         \
+  if (vectorized) {                                                            \
+    dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);                       \
+  }                                                                            \
+                                                                               \
+  if (way) {                                                                   \
+    cryptonight_##variant##_2way_hash(hash0, hash1, hash0, hash1);             \
+    cryptonight_##variant##_2way_hash(hash2, hash3, hash2, hash3);             \
+  } else {                                                                     \
+    cryptonight_##variant##_hash(hash0, hash0);                                \
+    cryptonight_##variant##_hash(hash1, hash1);                                \
+    cryptonight_##variant##_hash(hash2, hash2);                                \
+    cryptonight_##variant##_hash(hash3, hash3);                                \
+  }                                                                            \
+  vectorized = false;
+
 int gr_4way_hash(void *output, const void *input, int thrid) {
   uint64_t vhash[10 * 4] __attribute__((aligned(128)));
   uint64_t vhashA[10 * 2] __attribute__((aligned(128)));
@@ -267,95 +283,22 @@ int gr_4way_hash(void *output, const void *input, int thrid) {
       vectorized = true;
       break;
     case CNTurtlelite:
-      if (vectorized) {
-        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
-      }
-      if (cn_config[Turtlelite]) {
-        cryptonight_turtlelite_2way_hash(hash0, hash1, hash0, hash1);
-        cryptonight_turtlelite_2way_hash(hash2, hash3, hash2, hash3);
-      } else {
-        cryptonight_turtlelite_hash(hash0, hash0);
-        cryptonight_turtlelite_hash(hash1, hash1);
-        cryptonight_turtlelite_hash(hash2, hash2);
-        cryptonight_turtlelite_hash(hash3, hash3);
-      }
-      vectorized = false;
+      CRYPTONIGHT_HASH(turtlelite, cn_config[Turtlelite]);
       break;
     case CNTurtle:
-      if (vectorized) {
-        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
-      }
-      if (cn_config[Turtle]) {
-        cryptonight_turtle_2way_hash(hash0, hash1, hash0, hash1);
-        cryptonight_turtle_2way_hash(hash2, hash3, hash2, hash3);
-      } else {
-        cryptonight_turtle_hash(hash0, hash0);
-        cryptonight_turtle_hash(hash1, hash1);
-        cryptonight_turtle_hash(hash2, hash2);
-        cryptonight_turtle_hash(hash3, hash3);
-      }
-      vectorized = false;
+      CRYPTONIGHT_HASH(turtle, cn_config[Turtle]);
       break;
     case CNDarklite:
-      if (vectorized) {
-        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
-      }
-
-      if (cn_config[Darklite]) {
-        cryptonight_darklite_2way_hash(hash0, hash1, hash0, hash1);
-        cryptonight_darklite_2way_hash(hash2, hash3, hash2, hash3);
-      } else {
-        cryptonight_darklite_hash(hash0, hash0);
-        cryptonight_darklite_hash(hash1, hash1);
-        cryptonight_darklite_hash(hash2, hash2);
-        cryptonight_darklite_hash(hash3, hash3);
-      }
-      vectorized = false;
+      CRYPTONIGHT_HASH(darklite, cn_config[Darklite]);
       break;
     case CNDark:
-      if (vectorized) {
-        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
-      }
-      if (cn_config[Dark]) {
-        cryptonight_dark_2way_hash(hash0, hash1, hash0, hash1);
-        cryptonight_dark_2way_hash(hash2, hash3, hash2, hash3);
-      } else {
-        cryptonight_dark_hash(hash0, hash0);
-        cryptonight_dark_hash(hash1, hash1);
-        cryptonight_dark_hash(hash2, hash2);
-        cryptonight_dark_hash(hash3, hash3);
-      }
-      vectorized = false;
+      CRYPTONIGHT_HASH(dark, cn_config[Dark]);
       break;
     case CNLite:
-      if (vectorized) {
-        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
-      }
-      if (cn_config[Lite]) {
-        cryptonight_lite_2way_hash(hash0, hash1, hash0, hash1);
-        cryptonight_lite_2way_hash(hash2, hash3, hash2, hash3);
-      } else {
-        cryptonight_lite_hash(hash0, hash0);
-        cryptonight_lite_hash(hash1, hash1);
-        cryptonight_lite_hash(hash2, hash2);
-        cryptonight_lite_hash(hash3, hash3);
-      }
-      vectorized = false;
+      CRYPTONIGHT_HASH(lite, cn_config[Lite]);
       break;
     case CNFast:
-      if (vectorized) {
-        dintrlv_4x64_512(hash0, hash1, hash2, hash3, vhash);
-      }
-      if (cn_config[Fast]) {
-        cryptonight_fast_2way_hash(hash0, hash1, hash0, hash1);
-        cryptonight_fast_2way_hash(hash2, hash3, hash2, hash3);
-      } else {
-        cryptonight_fast_hash(hash0, hash0);
-        cryptonight_fast_hash(hash1, hash1);
-        cryptonight_fast_hash(hash2, hash2);
-        cryptonight_fast_hash(hash3, hash3);
-      }
-      vectorized = false;
+      CRYPTONIGHT_HASH(fast, cn_config[Fast]);
       break;
     }
 
@@ -396,23 +339,17 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
   __m256i *noncev = (__m256i *)vdata + 9; // aligned
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
 
-  if (opt_tune) {
+  if (!opt_tuned && opt_tune) {
     tune(pdata, thr_id);
     opt_tuned = true; // Tuned.
-    opt_tune = false; // Tune only once.
-    // Prevent error messages after tuning with --benchmark.
-    if (opt_benchmark) {
-      exit(0);
-    }
-  }
-
-  if (opt_benchmark_config) {
-    benchmark_configs(pdata, thr_id);
   }
 
   if (opt_benchmark) {
+    if (thr_id == 0) {
+      applog(LOG_BLUE, "Starting benchmark. Benchmark takes 300s to complete");
+    }
     benchmark(pdata, thr_id, 0);
-    diff_to_hash(ptarget, 0.05 / 65536.0);
+    exit(0);
   }
 
   mm256_bswap32_intrlv80_4x64(vdata, pdata);
diff --git a/algo/gr/gr-gate.c b/algo/gr/gr-gate.c
index ad8bbb7..7863a67 100755
--- a/algo/gr/gr-gate.c
+++ b/algo/gr/gr-gate.c
@@ -23,7 +23,7 @@ bool register_gr_algo(algo_gate_t *gate) {
   gate->scanhash = (void *)&scanhash_gr;
   gate->hash = (void *)&gr_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | VAES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | VAES_OPT | AVX2_OPT;
   opt_target_factor = 65536.0;
   return true;
 }
@@ -184,7 +184,7 @@ static void print_stats(const char *prefix, bool same_line) {
   // lock is not necessary.
   hashrate = bench_hashes / bench_time;
   bench_hashrate = hashrate;
-  scale_hash_for_display(&hashrate, hr_units);
+  // scale_hash_for_display(&hashrate, hr_units);
   if (same_line) {
     pthread_mutex_unlock(&applog_lock);
     printf("                      %s\t%.2lf %sH/s (%.2lfs)\t-> %.3lf %sH/s per "
@@ -195,7 +195,7 @@ static void print_stats(const char *prefix, bool same_line) {
     pthread_mutex_unlock(&applog_lock);
 
   } else {
-    applog(LOG_BLUE, "%s\t%.2lf %sH/s (%.2lfs)\t-> %.3lf %sH/s per thread.",
+    applog(LOG_BLUE, "%s \t%.2lf %sH/s (%.2lfs)\t-> %.3lf %sH/s per thread.",
            prefix, hashrate, hr_units, bench_time, hashrate / opt_n_threads,
            hr_units);
   }
@@ -222,64 +222,74 @@ static void sync_bench() { sync_lock(opt_n_threads + 1); }
 // Detached thread for changing rotation every 1.5s.
 // Prints data every rotation.
 void *statistic_thread(void *arg) {
-  bool infinite = true;
-  long sleep_time = 6000000;
-  if (arg != NULL) {
-    infinite = false;
-    sleep_time = *((long *)arg);
-  }
   struct timeval start, end, diff;
   double elapsed;
   sync_bench(); // Sync before benchmark starts.
-  sync_bench(); // Rotation change. Threads start with new rotation.
+  sync_bench(); // Rotation change sync.
   while (true) {
     gettimeofday(&start, NULL);
-    usleep(sleep_time);
+    if (arg != NULL) {
+      // Sleep for predefined time period.
+      usleep(*((long *)arg));
+    } else {
+      // Sleep portion of the whole benchmark. Portion is deretmined from
+      // real world data of each rotation time mining.
+      // This should provide more of a real world average that users can see
+      // on the pool side.
+      // Total benchmark time is 300s.
+      usleep(300000000. * time_ratio[rotation]);
+    }
     // Change rotation.
     rotation = (rotation + 1) % 20;
     gettimeofday(&end, NULL);
     timeval_subtract(&diff, &end, &start);
     elapsed = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
     bench_time += elapsed;
+    sync_bench(); // Rotation change sync.
     if (rotation == 0) {
       // Print permanently after full 20 rotations.
       print_stats("Hashrate (Avg): ", false);
-      if (!infinite) {
-        stop_benchmark =
-            true;     // Make sure it is true before other threads sync.
-        sync_bench(); // Config change sync.
-        return NULL;
-      }
+      // Make sure it is true before other threads sync.
+      stop_benchmark = true;
+      sync_bench(); // Config change sync.
+      return NULL;
     } else {
       print_stats("Hashrate (Avg): ", true);
     }
-    sync_bench();
   }
 }
 
-#if defined(GR_4WAY)
-
 static void tune_config(void *input, int thr_id, int rot) {
-  srand(time(NULL) + thr_id);
+  srand(thr_id);
   rotation = 19;
   long sleep_time = 12500000;
   pthread_t pthr;
   if (thr_id == 0) {
     pthread_create(&pthr, NULL, &statistic_thread, &sleep_time);
   }
-  uint32_t edata[4] __attribute__((aligned(64)));
+
+  uint32_t n = 10000 * thr_id;
+  uint32_t edata0[20] __attribute__((aligned(64)));
+  mm128_bswap32_80(edata0, input);
+  edata0[19] = n;
+#if defined(GR_4WAY)
   uint32_t hash[8 * 4] __attribute__((aligned(64)));
   uint32_t vdata[20 * 4] __attribute__((aligned(64)));
   __m256i *noncev = (__m256i *)vdata + 9; // aligned
   mm256_bswap32_intrlv80_4x64(vdata, input);
-  uint32_t n = 10000 * thr_id;
   *noncev = mm256_intrlv_blend_32(
       _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev);
-
+#else
+  uint32_t hash[8 * 2] __attribute__((aligned(64)));
+  uint32_t edata1[20] __attribute__((aligned(64)));
+  mm128_bswap32_80(edata1, input);
+  edata1[19] = n + 1;
+#endif
+  double hashes_done = 0.0;
   // Use CN rotation.
-  edata[1] = rand();
-  edata[2] = rand();
-  gr_getAlgoString((const uint8_t *)(&edata[1]), gr_hash_order);
+  edata0[1] = rand();
+  edata0[2] = rand();
+  gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
   gr_hash_order[5] = cn[rot][0] + 15;
   gr_hash_order[11] = cn[rot][1] + 15;
   gr_hash_order[17] = cn[rot][2] + 15;
@@ -291,13 +301,25 @@ static void tune_config(void *input, int thr_id, int rot) {
   sync_bench();
   sync_bench();
   while (true) {
+#if defined(GR_4WAY)
+    // Make sure nonces are increased for each hash. Same hashes will result
+    // in better data locality on CN algos leading to better/innaccurate
+    // results.
     gr_4way_hash(hash, vdata, thr_id);
     *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000));
-    n += 4;
-    pthread_mutex_lock(&stats_lock);
-    bench_hashes += 4;
-    pthread_mutex_unlock(&stats_lock);
+    hashes_done += 4.0;
+#else
+    // Increase nonce.
+    edata0[19] += 2;
+    edata1[19] += 2;
+    gr_hash(hash, edata0, edata1, thr_id);
+    hashes_done += 2.0;
+#endif
     if (rotation == 0) {
+      pthread_mutex_lock(&stats_lock);
+      bench_hashes += hashes_done;
+      pthread_mutex_unlock(&stats_lock);
+      sync_bench();
       sync_bench();
       break;
     }
@@ -307,7 +329,7 @@ static void tune_config(void *input, int thr_id, int rot) {
 static bool save_config() {
   char *filename = "tune_config";
   FILE *fd;
-  fd = fopen(filename, "w+");
+  fd = fopen(filename, "w");
   if (fd == NULL) {
     applog(LOG_ERR, "Could not save tune_config file");
     return false;
@@ -322,14 +344,6 @@ static bool save_config() {
 
 // Run tuning benchmarks and create tune_config in the end.
 void tune(void *input, int thr_id) {
-  if (thr_id == 0) {
-    // Test save empty config to see if we have permissions.
-    if (!save_config()) {
-      applog(LOG_ERR, "Check if you have permission to file 'tune_config'");
-      exit(0);
-    }
-  }
-
   for (int i = 0; i < 20; i++) {
     int best_hashrate = 0;
     if (thr_id == 0) {
@@ -346,19 +360,10 @@ void tune(void *input, int thr_id) {
       sync_conf();
       if (thr_id == 0) {
         if (best_hashrate < bench_hashrate) {
-          if (opt_debug) {
-            applog(LOG_DEBUG, "%d -> %d | %d -> %d | %d -> %d", cn[i][0],
-                   (config & 1) >> 0, cn[i][1], (config & 2) >> 1, cn[i][2],
-                   (config & 4) >> 2);
-          }
           cn_tune[i][cn_map[cn[i][0]]] = (config & 1) >> 0;
           cn_tune[i][cn_map[cn[i][1]]] = (config & 2) >> 1;
           cn_tune[i][cn_map[cn[i][2]]] = (config & 4) >> 2;
-          if (opt_debug) {
-            applog(LOG_DEBUG, "Config for rotation %d: %d %d %d %d %d %d", i,
-                   cn_tune[i][0], cn_tune[i][1], cn_tune[i][2], cn_tune[i][3],
-                   cn_tune[i][4], cn_tune[i][5]);
-          }
+
           best_hashrate = bench_hashrate;
         }
         bench_hashrate = 0;
@@ -388,46 +393,59 @@ void tune(void *input, int thr_id) {
   sync_conf();
 }
 
-#endif // __AVX2__ // GR_4WAY
-
 void benchmark(void *input, int thr_id, long sleep_time) {
-  srand(time(NULL) + thr_id);
+  for (int i = 0; i < 160; i++) {
+    ((uint8_t *)input)[i] = i;
+  }
+
+  srand(thr_id);
   pthread_t pthr;
   if (thr_id == 0) {
     pthread_create(&pthr, NULL, &statistic_thread,
                    sleep_time ? &sleep_time : NULL);
   }
 
-  uint32_t edata[20] __attribute__((aligned(64)));
+  uint32_t n = 10000 * thr_id;
+  uint32_t edata0[20] __attribute__((aligned(64)));
+  mm128_bswap32_80(edata0, input);
+  edata0[19] = n;
 #if defined(GR_4WAY)
   uint32_t hash[8 * 4] __attribute__((aligned(64)));
   uint32_t vdata[20 * 4] __attribute__((aligned(64)));
   __m256i *noncev = (__m256i *)vdata + 9; // aligned
   mm256_bswap32_intrlv80_4x64(vdata, input);
-  uint32_t n = 10000 * thr_id;
   *noncev = mm256_intrlv_blend_32(
       _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev);
 #else
-  uint32_t hash[8] __attribute__((aligned(64)));
-  mm128_bswap32_80(edata, input);
+  uint32_t hash[8 * 2] __attribute__((aligned(64)));
+  uint32_t edata1[20] __attribute__((aligned(64)));
+  mm128_bswap32_80(edata1, input);
+  edata1[19] = n + 1;
 #endif
   uint8_t local_rotation = 255;
+  double hashes_done = 0.0;
 
   sync_bench(); // Sync before benchmark starts.
   while (true) {
-    // gr_hash_order is calculated once per rotation as that is how it is done
-    // in scanhash_gr.
     if (likely(local_rotation != rotation)) {
-      local_rotation = rotation;
+      pthread_mutex_lock(&stats_lock);
+#if defined(GR_4WAY)
+      bench_hashes += hashes_done + 2.0;
+#else
+      bench_hashes += hashes_done + 1.0;
+#endif
+      pthread_mutex_unlock(&stats_lock);
+      hashes_done = 0.0;
       // Change first part of the hash to get different core rotation.
-      edata[1] = rand();
-      edata[2] = rand();
-
+      for (int i = 1; i < 5 + 1; ++i) {
+        edata0[i] = rand();
+      }
       // Use new rotation.
-      gr_getAlgoString((const uint8_t *)(&edata[1]), gr_hash_order);
+      gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
       gr_hash_order[5] = cn[rotation][0] + 15;
       gr_hash_order[11] = cn[rotation][1] + 15;
       gr_hash_order[17] = cn[rotation][2] + 15;
+
       if (opt_tuned) {
         select_tuned_config(thr_id);
       }
@@ -436,11 +454,19 @@ void benchmark(void *input, int thr_id, long sleep_time) {
       AllocateNeededMemory();
 
       sync_bench(); // Rotation change sync.
-      if (rotation == 0) {
+      if (rotation == 0 && local_rotation != 255) {
+        sync_bench(); // Rotation change sync.
         if (likely(stop_benchmark)) {
           return;
         }
       }
+      local_rotation = rotation;
+    } else {
+#if defined(GR_4WAY)
+      hashes_done += 4.0;
+#else
+      hashes_done += 2.0;
+#endif
     }
 #if defined(GR_4WAY)
     // Make sure nonces are increased for each hash. Same hashes will result
@@ -448,64 +474,11 @@ void benchmark(void *input, int thr_id, long sleep_time) {
     // results.
     gr_4way_hash(hash, vdata, thr_id);
     *noncev = _mm256_add_epi32(*noncev, m256_const1_64(0x0000000400000000));
-    n += 4;
 #else
     // Increase nonce.
-    edata[19]++;
-    gr_hash(hash, edata, thr_id);
-#endif
-    // Calculated hash. Do not count half finished hahshes.
-    pthread_mutex_lock(&stats_lock);
-#if defined(GR_4WAY)
-    bench_hashes += 4;
-#else
-    bench_hashes += 1;
+    edata0[19] += 2;
+    edata1[19] += 2;
+    gr_hash(hash, edata0, edata1, thr_id);
 #endif
-    pthread_mutex_unlock(&stats_lock);
-  }
-}
-
-void benchmark_configs(void *input, int thr_id) {
-  int best_config = 0;
-  int best_hashrate = 0;
-
-  for (int i = 0; i < (1 << 6); i++) {
-    // Set new cn_config to test.
-    cn_config[0] = (i & 1) >> 0;
-    cn_config[1] = (i & 2) >> 1;
-    cn_config[2] = (i & 4) >> 2;
-    cn_config[3] = (i & 8) >> 3;
-    cn_config[4] = (i & 16) >> 4;
-    cn_config[5] = (i & 32) >> 5;
-    if (!thr_id) {
-      applog(LOG_NOTICE, "Testing Cryptonigh --cn-config %d,%d,%d,%d,%d,%d",
-             cn_config[0], cn_config[1], cn_config[2], cn_config[3],
-             cn_config[4], cn_config[5]);
-
-      // Reset benchamrk variables to default.
-      bench_time = 0.0;
-      bench_hashes = 0.0;
-      bench_hashrate = 0.0;
-      rotation = 0;
-    }
-    sync_conf();
-    stop_benchmark = false;
-    sync_conf();
-    benchmark(input, thr_id, 1000000);
-
-    // Check if this config is better.
-    if (thr_id == 0) {
-      if (bench_hashrate > best_hashrate) {
-        best_hashrate = bench_hashrate;
-        best_config = i;
-      }
-    }
-  }
-  // Show best config.
-  if (!thr_id) {
-    applog(LOG_NOTICE, "Best --cn-config %d,%d,%d,%d,%d,%d",
-           (best_config & 1) >> 0, (best_config & 2) >> 1,
-           (best_config & 4) >> 2, (best_config & 8) >> 3,
-           (best_config & 16) >> 4, (best_config & 32) >> 5);
   }
 }
diff --git a/algo/gr/gr-gate.h b/algo/gr/gr-gate.h
index be844b7..af87f35 100755
--- a/algo/gr/gr-gate.h
+++ b/algo/gr/gr-gate.h
@@ -25,6 +25,7 @@
 #include "cryptonote/cryptonight.h"
 #include "simd-utils.h"
 #include <stdint.h>
+#include <stdlib.h>
 
 #if defined(__AES__)
 #include "algo/echo/aes_ni/hash_api.h"
@@ -117,7 +118,7 @@ typedef union _gr_4way_context_overlay gr_4way_context_overlay;
 
 extern __thread gr_4way_context_overlay gr_4way_ctx;
 
-int gr_4way_hash(void *state, const void *input, int thrid);
+int gr_4way_hash(void *hash, const void *input, int thrid);
 int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr);
 
@@ -151,13 +152,20 @@ typedef union _gr_context_overlay gr_context_overlay;
 
 extern __thread gr_context_overlay gr_ctx;
 
-int gr_hash(void *state, const void *input, int thrid);
+int gr_hash(void *hash, const void *input0, const void *input1, int thrid);
 int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done,
                 struct thr_info *mythr);
 
 // Memory state
 extern __thread uint8_t *hp_state;
 
+// Time ratio for each kind of block/rotation.
+// Data gathered from 16 days of mining, 18251 blocks.
+static const double time_ratio[20] = {
+    0.081253, 0.077720, 0.048476, 0.046917, 0.077396, 0.044738, 0.048130,
+    0.046897, 0.046318, 0.027358, 0.080602, 0.048928, 0.048283, 0.048863,
+    0.044444, 0.027389, 0.048813, 0.050674, 0.028581, 0.028219};
+
 // Values for 20 CN rotations.
 static const uint8_t cn[20][3] = {
     {0, 1, 2}, {0, 1, 3}, {0, 1, 4}, {0, 1, 5}, {0, 2, 3}, {0, 2, 4}, {0, 2, 5},
diff --git a/algo/gr/gr.c b/algo/gr/gr.c
index 9035c72..426fa69 100755
--- a/algo/gr/gr.c
+++ b/algo/gr/gr.c
@@ -1,110 +1,221 @@
 #include "gr-gate.h"
 
-int gr_hash(void *output, const void *input, int thrid) {
-  uint64_t hash[8] __attribute__((aligned(64)));
+#define CRYPTONIGHT_HASH(variant, way)                                         \
+  if (way) {                                                                   \
+    cryptonight_##variant##_2way_hash(hash0, hash1, hash0, hash1);             \
+  } else {                                                                     \
+    cryptonight_##variant##_hash(hash0, hash0);                                \
+    cryptonight_##variant##_hash(hash1, hash1);                                \
+  }
+
+#define CORE_HASH(hash, input, output, size)                                   \
+  sph_##hash##512_init(&ctx.hash);                                             \
+  sph_##hash##512(&ctx, input, size);                                          \
+  sph_##hash##512_close(&ctx, output);
+
+int gr_hash(void *output, const void *input0, const void *input1, int thrid) {
+  uint64_t hash0[10] __attribute__((aligned(64)));
+  uint64_t hash1[10] __attribute__((aligned(64)));
   gr_context_overlay ctx;
   memcpy(&ctx, &gr_ctx, sizeof(ctx));
-  void *in = (void *)input;
-  int size = 80;
 
   for (int i = 0; i < 15 + 3; i++) {
     const uint8_t algo = gr_hash_order[i];
     switch (algo) {
     case BLAKE:
-      sph_blake512_init(&ctx.blake);
-      sph_blake512(&ctx.blake, in, size);
-      sph_blake512_close(&ctx.blake, hash);
+      if (i == 0) {
+        CORE_HASH(blake, input0, hash0, 80);
+        CORE_HASH(blake, input1, hash1, 80);
+      } else {
+        CORE_HASH(blake, hash0, hash0, 64);
+        CORE_HASH(blake, hash1, hash1, 64);
+      }
       break;
     case BMW:
-      sph_bmw512_init(&ctx.bmw);
-      sph_bmw512(&ctx.bmw, in, size);
-      sph_bmw512_close(&ctx.bmw, hash);
+      if (i == 0) {
+        CORE_HASH(bmw, input0, hash0, 80);
+        CORE_HASH(bmw, input1, hash1, 80);
+      } else {
+        CORE_HASH(bmw, hash0, hash0, 64);
+        CORE_HASH(bmw, hash1, hash1, 64);
+      }
       break;
     case GROESTL:
 #if defined(__AES__)
-      groestl512_full(&ctx.groestl, (char *)hash, (char *)in, size << 3);
+      if (i == 0) {
+        groestl512_full(&ctx.groestl, (char *)hash0, (char *)input0, 640);
+        groestl512_full(&ctx.groestl, (char *)hash1, (char *)input1, 640);
+      } else {
+        groestl512_full(&ctx.groestl, (char *)hash0, (char *)hash0, 512);
+        groestl512_full(&ctx.groestl, (char *)hash1, (char *)hash1, 512);
+      }
 #else
-      sph_groestl512_init(&ctx.groestl);
-      sph_groestl512(&ctx.groestl, in, size);
-      sph_groestl512_close(&ctx.groestl, hash);
+      if (i == 0) {
+        CORE_HASH(groestl, input0, hash0, 80);
+        CORE_HASH(groestl, input1, hash1, 80);
+      } else {
+        CORE_HASH(groestl, hash0, hash0, 64);
+        CORE_HASH(groestl, hash1, hash1, 64);
+      }
 #endif
       break;
     case SKEIN:
-      sph_skein512_init(&ctx.skein);
-      sph_skein512(&ctx.skein, in, size);
-      sph_skein512_close(&ctx.skein, hash);
+      if (i == 0) {
+        CORE_HASH(skein, input0, hash0, 80);
+        CORE_HASH(skein, input1, hash1, 80);
+      } else {
+        CORE_HASH(skein, hash0, hash0, 64);
+        CORE_HASH(skein, hash1, hash1, 64);
+      }
       break;
     case JH:
-      sph_jh512_init(&ctx.jh);
-      sph_jh512(&ctx.jh, in, size);
-      sph_jh512_close(&ctx.jh, hash);
+      if (i == 0) {
+        CORE_HASH(jh, input0, hash0, 80);
+        CORE_HASH(jh, input1, hash1, 80);
+      } else {
+        CORE_HASH(jh, hash0, hash0, 64);
+        CORE_HASH(jh, hash1, hash1, 64);
+      }
       break;
     case KECCAK:
-      sph_keccak512_init(&ctx.keccak);
-      sph_keccak512(&ctx.keccak, in, size);
-      sph_keccak512_close(&ctx.keccak, hash);
+      if (i == 0) {
+        CORE_HASH(keccak, input0, hash0, 80);
+        CORE_HASH(keccak, input1, hash1, 80);
+      } else {
+        CORE_HASH(keccak, hash0, hash0, 64);
+        CORE_HASH(keccak, hash1, hash1, 64);
+      }
       break;
     case LUFFA:
-      luffa_full(&ctx.luffa, (BitSequence *)hash, 512, (const BitSequence *)in,
-                 size);
+      if (i == 0) {
+        luffa_full(&ctx.luffa, (BitSequence *)hash0, 512,
+                   (const BitSequence *)input0, 80);
+        luffa_full(&ctx.luffa, (BitSequence *)hash1, 512,
+                   (const BitSequence *)input1, 80);
+      } else {
+        luffa_full(&ctx.luffa, (BitSequence *)hash0, 512,
+                   (const BitSequence *)hash0, 64);
+        luffa_full(&ctx.luffa, (BitSequence *)hash1, 512,
+                   (const BitSequence *)hash1, 64);
+      }
       break;
     case CUBEHASH:
-      cubehash_full(&ctx.cube, (byte *)hash, 512, (byte *)in, size);
+      if (i == 0) {
+        cubehash_full(&ctx.cube, (byte *)hash0, 512, (byte *)input0, 80);
+        cubehash_full(&ctx.cube, (byte *)hash1, 512, (byte *)input1, 80);
+      } else {
+        cubehash_full(&ctx.cube, (byte *)hash0, 512, (byte *)hash0, 64);
+        cubehash_full(&ctx.cube, (byte *)hash1, 512, (byte *)hash1, 64);
+      }
       break;
     case SHAVITE:
-      shavite512_full(&ctx.shavite, hash, in, size);
+      if (i == 0) {
+        shavite512_full(&ctx.shavite, hash0, input0, 80);
+        shavite512_full(&ctx.shavite, hash1, input1, 80);
+      } else {
+        shavite512_full(&ctx.shavite, hash0, hash0, 64);
+        shavite512_full(&ctx.shavite, hash1, hash1, 64);
+      }
       break;
     case SIMD:
-      simd_full(&ctx.simd, (BitSequence *)hash, (const BitSequence *)in,
-                size << 3);
+      if (i == 0) {
+        simd_full(&ctx.simd, (BitSequence *)hash0, (const BitSequence *)input0,
+                  640);
+        simd_full(&ctx.simd, (BitSequence *)hash1, (const BitSequence *)input1,
+                  640);
+      } else {
+        simd_full(&ctx.simd, (BitSequence *)hash0, (const BitSequence *)hash0,
+                  512);
+        simd_full(&ctx.simd, (BitSequence *)hash1, (const BitSequence *)hash1,
+                  512);
+      }
       break;
     case ECHO:
 #if defined(__AES__)
-      echo_full(&ctx.echo, (BitSequence *)hash, 512, (const BitSequence *)in,
-                size);
+      if (i == 0) {
+        echo_full(&ctx.echo, (BitSequence *)hash0, 512,
+                  (const BitSequence *)input0, 80);
+        echo_full(&ctx.echo, (BitSequence *)hash1, 512,
+                  (const BitSequence *)input1, 80);
+      } else {
+        echo_full(&ctx.echo, (BitSequence *)hash0, 512,
+                  (const BitSequence *)hash0, 64);
+        echo_full(&ctx.echo, (BitSequence *)hash1, 512,
+                  (const BitSequence *)hash1, 64);
+      }
 #else
-      sph_echo512_init(&ctx.echo);
-      sph_echo512(&ctx.echo, in, size);
-      sph_echo512_close(&ctx.echo, hash);
+      if (i == 0) {
+        CORE_HASH(echo, input0, hash0, 80);
+        CORE_HASH(echo, input1, hash1, 80);
+      } else {
+        CORE_HASH(echo, hash0, hash0, 64);
+        CORE_HASH(echo, hash1, hash1, 64);
+      }
 #endif
       break;
     case HAMSI:
-      sph_hamsi512_init(&ctx.hamsi);
-      sph_hamsi512(&ctx.hamsi, in, size);
-      sph_hamsi512_close(&ctx.hamsi, hash);
+      if (i == 0) {
+        CORE_HASH(hamsi, input0, hash0, 80);
+        CORE_HASH(hamsi, input1, hash1, 80);
+      } else {
+        CORE_HASH(hamsi, hash0, hash0, 64);
+        CORE_HASH(hamsi, hash1, hash1, 64);
+      }
       break;
     case FUGUE:
 #if defined(__AES__)
-      fugue512_full(&ctx.fugue, hash, in, size);
+      if (i == 0) {
+        fugue512_full(&ctx.fugue, hash0, input0, 80);
+        fugue512_full(&ctx.fugue, hash1, input1, 80);
+      } else {
+        fugue512_full(&ctx.fugue, hash0, hash0, 64);
+        fugue512_full(&ctx.fugue, hash1, hash1, 64);
+      }
 #else
-      sph_fugue512_full(&ctx.fugue, hash, in, size);
+      if (i == 0) {
+        sph_fugue512_full(&ctx.fugue, hash0, input0, 80);
+        sph_fugue512_full(&ctx.fugue, hash1, input1, 80);
+      } else {
+        sph_fugue512_full(&ctx.fugue, hash0, hash0, 64);
+        sph_fugue512_full(&ctx.fugue, hash1, hash1, 64);
+      }
 #endif
       break;
     case SHABAL:
-      sph_shabal512_init(&ctx.shabal);
-      sph_shabal512(&ctx.shabal, in, size);
-      sph_shabal512_close(&ctx.shabal, hash);
+      if (i == 0) {
+        CORE_HASH(shabal, input0, hash0, 80);
+        CORE_HASH(shabal, input1, hash1, 80);
+      } else {
+        CORE_HASH(shabal, hash0, hash0, 64);
+        CORE_HASH(shabal, hash1, hash1, 64);
+      }
       break;
     case WHIRLPOOL:
-      sph_whirlpool512_full(&ctx.whirlpool, hash, in, size);
+      if (i == 0) {
+        sph_whirlpool512_full(&ctx.whirlpool, hash0, input0, 80);
+        sph_whirlpool512_full(&ctx.whirlpool, hash1, input1, 80);
+      } else {
+        sph_whirlpool512_full(&ctx.whirlpool, hash0, hash0, 64);
+        sph_whirlpool512_full(&ctx.whirlpool, hash1, hash1, 64);
+      }
       break;
     case CNTurtlelite:
-      cryptonight_turtlelite_hash(in, hash);
+      CRYPTONIGHT_HASH(turtlelite, cn_config[Turtlelite]);
       break;
     case CNTurtle:
-      cryptonight_turtle_hash(in, hash);
+      CRYPTONIGHT_HASH(turtle, cn_config[Turtle]);
       break;
     case CNDarklite:
-      cryptonight_darklite_hash(in, hash);
+      CRYPTONIGHT_HASH(darklite, cn_config[Darklite]);
       break;
     case CNDark:
-      cryptonight_dark_hash(in, hash);
+      CRYPTONIGHT_HASH(dark, cn_config[Dark]);
       break;
     case CNLite:
-      cryptonight_lite_hash(in, hash);
+      CRYPTONIGHT_HASH(lite, cn_config[Lite]);
       break;
     case CNFast:
-      cryptonight_fast_hash(in, hash);
+      CRYPTONIGHT_HASH(fast, cn_config[Fast]);
       break;
     }
 
@@ -115,41 +226,47 @@ int gr_hash(void *output, const void *input, int thrid) {
       }
       return 0;
     }
-    in = (void *)hash;
-    size = 64;
   }
-  memcpy(output, hash, 32);
+  memcpy(output, hash0, 32);
+  memcpy(output + 32, hash1, 32);
   return 1;
 }
 
 int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done,
                 struct thr_info *mythr) {
 
-  uint32_t _ALIGN(128) hash32[8];
-  uint32_t _ALIGN(128) edata[20];
+  uint32_t hash[2 * 8] __attribute__((aligned(64)));
+  uint32_t edata0[20] __attribute__((aligned(64)));
+  uint32_t edata1[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
+  const uint32_t last_nonce = max_nonce - 2;
   const int thr_id = mythr->id;
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
 
-  if (opt_benchmark_config) {
-    benchmark_configs(pdata, thr_id);
+  if (!opt_tuned && opt_tune) {
+    tune(pdata, thr_id);
+    opt_tuned = true; // Tuned.
   }
 
   if (opt_benchmark) {
+    if (thr_id == 0) {
+      applog(LOG_BLUE, "Starting benchmark. Benchmark takes 300s to complete");
+    }
     benchmark(pdata, thr_id, 0);
-    diff_to_hash(ptarget, 0.05 / 65536.0);
+    exit(0);
   }
 
-  mm128_bswap32_80(edata, pdata);
+  mm128_bswap32_80(edata0, pdata);
+  mm128_bswap32_80(edata1, pdata);
 
   // Check if algorithm order changed.
   static __thread uint32_t s_ntime = UINT32_MAX;
   if (s_ntime != pdata[17]) {
     uint32_t ntime = swab32(pdata[17]);
-    gr_getAlgoString((const uint8_t *)(&edata[1]), gr_hash_order);
+    gr_getAlgoString((const uint8_t *)(&edata0[1]), gr_hash_order);
     s_ntime = ntime;
     if (opt_debug && !thr_id) {
       char order[100];
@@ -158,26 +275,35 @@ int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done,
       }
       applog(LOG_DEBUG, "hash order %s (%08x)", order, ntime);
     }
+    if (opt_tuned) {
+      select_tuned_config(thr_id);
+    }
   }
 
   // Allocates hp_state for Cryptonight algorithms.
   // Needs to be run AFTER gr_hash_order is set!
   AllocateNeededMemory();
 
+  edata0[19] = nonce;
+  edata1[19] = nonce + 1;
+
   do {
-    edata[19] = nonce;
-    if (gr_hash(hash32, edata, thr_id)) {
-      if (unlikely(valid_hash(hash32, ptarget))) {
-        if (opt_debug) {
-          applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
-                 bswap_32(nonce), hash_to_diff(hash32));
+    if (gr_hash(hash, edata0, edata1, thr_id)) {
+      for (int i = 0; i < 2; i++) {
+        if (unlikely(valid_hash(hash + (i << 3), ptarget))) {
+          if (opt_debug) {
+            applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
+                   bswap_32(nonce + i), hash_to_diff(hash + (i << 3)));
+          }
+          pdata[19] = bswap_32(nonce + i);
+          submit_solution(work, hash + (i << 3), mythr);
         }
-        pdata[19] = bswap_32(nonce);
-        submit_solution(work, hash32, mythr);
       }
     }
-    nonce++;
-  } while (likely(nonce < max_nonce && !(*restart)));
+    edata0[19] += 2;
+    edata1[19] += 2;
+    nonce += 2;
+  } while (likely((nonce < last_nonce) && !(*restart)));
   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
diff --git a/build-allarch.sh b/build-allarch.sh
index c7ba672..63674bf 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -31,6 +31,7 @@ mv cpuminer bin/unix/${4}/cpuminer-${2}
 
 }
 
+
 #Non-AES
 # Generic SSE2
 compile "x86-64" "sse2" "-msse"
diff --git a/configure b/configure
index a3e422c..ee92ab2 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt-gr 1.1.5.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt-gr 1.1.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt-gr'
 PACKAGE_TARNAME='cpuminer-opt-gr'
-PACKAGE_VERSION='1.1.5'
-PACKAGE_STRING='cpuminer-opt-gr 1.1.5'
+PACKAGE_VERSION='1.1.6'
+PACKAGE_STRING='cpuminer-opt-gr 1.1.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt-gr 1.1.5 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt-gr 1.1.6 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1404,7 +1404,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt-gr 1.1.5:";;
+     short | recursive ) echo "Configuration of cpuminer-opt-gr 1.1.6:";;
    esac
   cat <<\_ACEOF
 
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt-gr configure 1.1.5
+cpuminer-opt-gr configure 1.1.6
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt-gr $as_me 1.1.5, which was
+It was created by cpuminer-opt-gr $as_me 1.1.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='cpuminer-opt-gr'
- VERSION='1.1.5'
+ VERSION='1.1.6'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt-gr $as_me 1.1.5, which was
+This file was extended by cpuminer-opt-gr $as_me 1.1.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt-gr config.status 1.1.5
+cpuminer-opt-gr config.status 1.1.6
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index aa19f4d..c5ec5d3 100755
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt-gr], [1.1.5])
+AC_INIT([cpuminer-opt-gr], [1.1.6])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 3653079..ed7d757 100755
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -42,8 +42,8 @@
 #include <unistd.h>
 
 #ifdef WIN32
-#include <windows.h>
 #include <winsock2.h>
+#include <windows.h>
 #endif
 
 #ifdef _MSC_VER
@@ -152,8 +152,8 @@ bool stratum_problem = false;
 __thread uint8_t cn_config[6] = {0, 0, 0, 0, 0, 0};
 uint8_t cn_config_global[6] = {0, 0, 0, 0, 0, 0};
 
-bool opt_tune = false;
 bool opt_tuned = false;
+bool opt_tune = true;
 uint8_t cn_tune[20][6];
 
 // pk_buffer_size is used as a version selector by b58 code, therefore
@@ -1424,17 +1424,27 @@ static bool stratum_check(bool reset) {
         !stratum_subscribe(&stratum) ||
         !stratum_authorize(&stratum, rpc_user, rpc_pass)) {
       stratum_disconnect(&stratum);
-      if (opt_retries >= 0 && ++failures > opt_retries) {
+      failures++;
+      if (opt_retries >= 0 && failures > opt_retries) {
         applog(LOG_ERR, "...terminating workio thread");
         tq_push(thr_info[work_thr_id].q, NULL);
         pthread_mutex_unlock(&stratum_lock);
-        stratum_problem = false;
+        stratum_problem = true;
+        return false;
+      } else if (failures >= 10) {
+        // This should prevent stratum recheck during Dev fee.
+        // If there is a problem with dev fee stratum and the miner is currently
+        // collecting it, it can loop infinitely until dev fee stratum comes
+        // back alive. It should exit as maybe dev fee ended and user pool
+        // should work.`
+        pthread_mutex_unlock(&stratum_lock);
+        stratum_problem = true;
         return false;
       }
       if (!opt_benchmark) {
         restart_threads();
         stratum_problem = true;
-        applog(LOG_ERR, "...retry after %d secondssss", opt_fail_pause);
+        applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
       }
       sleep(opt_fail_pause);
     } else {
@@ -1449,6 +1459,17 @@ static bool stratum_check(bool reset) {
 static bool submit_upstream_work(CURL *curl, struct work *work) {
   if (have_stratum) {
     char req[JSON_BUF_LEN];
+
+    // Check if the work that is to be submitted it stale already or not.
+    if ((work->data[algo_gate.ntime_index] !=
+         g_work.data[algo_gate.ntime_index]) ||
+        stratum_problem || g_work_time == 0) {
+      applog(LOG_WARNING, "Skip stale share.");
+      pthread_mutex_lock(&stats_lock);
+      stale_share_count++;
+      pthread_mutex_unlock(&stats_lock);
+      return false;
+    }
     pthread_mutex_lock(&stratum_lock);
     stratum.sharediff = work->sharediff;
     algo_gate.build_stratum_request(req, work, &stratum);
@@ -1723,6 +1744,15 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) {
 static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) {
   int failures = 0;
 
+  if ((wc->u.work->data[algo_gate.ntime_index] !=
+       g_work.data[algo_gate.ntime_index]) ||
+      stratum_problem || g_work_time == 0) {
+    applog(LOG_WARNING, "Skip stale share.");
+    pthread_mutex_lock(&stats_lock);
+    stale_share_count++;
+    pthread_mutex_unlock(&stats_lock);
+    return true;
+  }
   /* submit solution to bitcoin via JSON-RPC */
   while (!submit_upstream_work(curl, wc->u.work)) {
     if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) {
@@ -1732,6 +1762,16 @@ static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) {
     /* pause, then restart work-request loop */
     if (!opt_benchmark)
       applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause);
+
+    if ((wc->u.work->data[algo_gate.ntime_index] !=
+         g_work.data[algo_gate.ntime_index]) ||
+        stratum_problem || g_work_time == 0) {
+      applog(LOG_WARNING, "Skip stale share.");
+      pthread_mutex_lock(&stats_lock);
+      stale_share_count++;
+      pthread_mutex_unlock(&stats_lock);
+      return true;
+    }
     sleep(opt_fail_pause);
   }
   return true;
@@ -2739,20 +2779,21 @@ char *rpc_url_original = NULL;
 // Data about dev wallets.
 // idx 0 - Ausminer
 // idx 1 - Delgon
-const uint8_t max_idx = 3;
+const uint8_t max_idx = 4;
 uint8_t donation_url_idx[2] = {0, 0};
-char *donation_url_pattern[2][3] = {{"r-pool", "suprnova", "ausminers"},
-                                    {"r-pool", "suprnova", "ausminers"}};
-char *donation_url[2][3] = {
+char *donation_url_pattern[2][4] = {
+    {"r-pool", "suprnova", "ausminers", "p2pool"},
+    {"r-pool", "suprnova", "ausminers", "p2pool"}};
+char *donation_url[2][4] = {
     {"stratum+tcp://r-pool.net:3008", "stratum+tcp://rtm.suprnova.cc:6273",
-     "stratum+tcp://rtm.ausminers.com:3001"},
+     "stratum+tcp://rtm.ausminers.com:3001", "stratum+tcp://p2pool.co:3008"},
     {"stratum+tcp://r-pool.net:3008", "stratum+tcp://rtm.suprnova.cc:6273",
-     "stratum+tcp://rtm.ausminers.com:3001"}};
+     "stratum+tcp://rtm.ausminers.com:3001", "stratum+tcp://p2pool.co:3008"}};
 char *donation_userRTM[2] = {"RXq9v8WbMLZaGH79GmK2oEdc33CTYkvyoZ",
                              "RQKcAZBtsSacMUiGNnbk3h3KJAN94tstvt"};
 char *donation_userBUTK[2] = {"XdFVd4X4Ru688UVtKetxxJPD54hPfemhxg",
                               "XeMjEpWscVu2A5kj663Tqtn2d7cPYYXnDN"};
-char *donation_pass[3] = {"x", "x", "x"};
+char *donation_pass[4] = {"x", "x", "x", "x"};
 bool enable_donation = true;
 double donation_percent = 1.0;
 int dev_turn = 0;
@@ -2838,7 +2879,7 @@ static bool donation_connect() {
       applog(LOG_BLUE, "Stratum connection established");
     }
 
-    if (stratum.curl != NULL && false) {
+    if (stratum.curl != NULL) {
       pthread_mutex_unlock(&stratum_lock);
       return true;
     } else {
@@ -2936,7 +2977,8 @@ static void *stratum_thread(void *userdata) {
     pthread_mutex_lock(&stratum_lock);
     if (!stratum_check(false)) {
       pthread_mutex_unlock(&stratum_lock);
-      goto out;
+      continue;
+      // goto out;
     }
     pthread_mutex_unlock(&stratum_lock);
 
@@ -2983,8 +3025,8 @@ static void show_credits() {
   printf("     with Ghostrider Algo SSE&AVX2 by Ausminer & Delgon.\n");
   printf("     Jay D Dee's BTC donation address: "
          "12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
-  printf(
-      "     RTM Donations happen 1 min every 100 min (-d X to increase)\n\n");
+  printf("     RTM Donations happen for 1 min every 100 min (-d X to "
+         "increase)\n\n");
 }
 
 #define check_cpu_capability() cpu_capability(false)
@@ -3008,14 +3050,14 @@ static bool cpu_capability(bool display_only) {
   bool sw_has_sha = false;
   bool sw_has_vaes = false;
   set_t algo_features = algo_gate.optimizations;
-  bool algo_has_sse2 = true;     // set_incl( SSE2_OPT,    algo_features );
-  bool algo_has_aes = true;      // set_incl( AES_OPT,     algo_features );
-  bool algo_has_sse42 = true;    // set_incl( SSE42_OPT,   algo_features );
-  bool algo_has_avx2 = true;     // set_incl( AVX2_OPT,    algo_features );
-  bool algo_has_avx512 = false;  // set_incl( AVX512_OPT,  algo_features );
-  bool algo_has_sha = false;     // set_incl( SHA_OPT,     algo_features );
-  bool algo_has_vaes = false;    // set_incl( VAES_OPT,    algo_features );
-  bool algo_has_vaes256 = false; // set_incl( VAES256_OPT, algo_features );
+  bool algo_has_sse2 = set_incl(SSE2_OPT, algo_features);
+  bool algo_has_aes = set_incl(AES_OPT, algo_features);
+  bool algo_has_sse42 = set_incl(SSE42_OPT, algo_features);
+  bool algo_has_avx2 = set_incl(AVX2_OPT, algo_features);
+  bool algo_has_avx512 = set_incl(AVX512_OPT, algo_features);
+  bool algo_has_sha = set_incl(SHA_OPT, algo_features);
+  bool algo_has_vaes = set_incl(VAES_OPT, algo_features);
+  bool algo_has_vaes256 = set_incl(VAES256_OPT, algo_features);
   bool use_aes;
   bool use_sse2;
   bool use_sse42;
@@ -3658,69 +3700,15 @@ void parse_arg(int key, char *arg) {
     exit(0);
   case 'h':
     show_usage_and_exit(0);
-  case 1101: // cn-config
-#ifndef __AVX2__
-    applog(LOG_ERR, "--cn-config requires AVX2+ instruction set");
-    show_usage_and_exit(1);
-#endif
-    // arg - light / medium / heavy
-    if (strcmp(arg, "light") == 0) {
-      memset(cn_config_global, 0, 6);
-      return;
-    } else if (strcmp(arg, "medium") == 0) {
-      memset(cn_config_global, 1, 6);
-      cn_config_global[4] = 0; // Lite
-      cn_config_global[5] = 0; // Fast
-      return;
-    } else if (strcmp(arg, "heavy") == 0) {
-      memset(cn_config_global, 1, 6);
-      return;
-    }
-    // arg - list like 1,1,0,0,1,1
-    // Custom list of which variants should use 2way.
-    char *cn = strtok(arg, ",");
-    int count = 0;
-    while (cn != NULL && count < 6) {
-      v = atoi(cn);
-      if (v != 0 && v != 1) {
-        show_usage_and_exit(1);
-      }
-      cn_config_global[count++] = v;
-      cn = strtok(NULL, ",");
-    }
-    if (count != 6) {
-      show_usage_and_exit(1);
-    }
-    break;
-  case 1102: // benchmark-config
-#ifndef __AVX2__
-    applog(LOG_ERR, "--benchmark-config requires AVX2+ instruction set");
-    show_usage_and_exit(1);
-#endif
-    opt_benchmark = true;
-    opt_benchmark_config = true;
-    want_longpoll = false;
-    want_stratum = false;
-    have_stratum = false;
-    break;
-  case 1103: // tune
-#ifndef __AVX2__
-    applog(LOG_ERR, "--tune requires AVX2+ instruction set");
-
-    show_usage_and_exit(1);
-#endif
-    opt_tune = true;
+  case 1103: // no-tune
+    opt_tune = false;
     break;
   case 1104: // tune-config
-#ifndef __AVX2__
-    applog(LOG_ERR, "--tune-config requires AVX2+ instruction set");
-    show_usage_and_exit(1);
-#endif
     opt_tuned = true;
     if (!load_tune_config(arg)) {
       show_usage_and_exit(1);
     } else {
-      applog(LOG_BLUE, "Tune config loaded succesfully");
+      applog(LOG_BLUE, "Tune config \'%s\' loaded succesfully", arg);
     }
     break;
   default:
@@ -3847,11 +3835,6 @@ int main(int argc, char *argv[]) {
 
   donation_time_start = now + 60 + (rand() % 60);
   donation_time_stop = donation_time_start + (60 * donation_percent);
-  if (opt_tune) {
-    // Tuning takes ~33 minutes. Add it to the donation timers.
-    donation_time_start += (35 * 60);
-    donation_time_stop += (35 * 60);
-  }
 
   // Switch off donations if it is not using GR Algo
   if (opt_algo != ALGO_GR || opt_benchmark) {
@@ -4057,7 +4040,8 @@ int main(int argc, char *argv[]) {
   }
 
   if (!opt_quiet && (opt_n_threads < num_cpus)) {
-    char affinity_map[64];
+    char affinity_map[100];
+    memset(affinity_map, 0, 100);
     format_affinity_map(affinity_map, opt_affinity);
     applog(LOG_INFO, "CPU affinity [%s]", affinity_map);
   }
@@ -4067,12 +4051,31 @@ int main(int argc, char *argv[]) {
     openlog("cpuminer", LOG_PID, LOG_USER);
 #endif
 
+  // Tuning not loaded and not disabled. Try loading tune_config file.
+  if (!opt_tuned && opt_tune) {
+    if (!load_tune_config("tune_config")) {
+      applog(LOG_WARNING, "Could not find \'tune_config\' file. Miner will "
+                          "perform tuning operation.");
+      applog(LOG_WARNING, "Tuning process takes 34 minutes to finish.");
+      applog(LOG_WARNING, "Add --no-tune to your commandline to disable it. ");
+
+      // Tuning takes ~34 minutes. Add it to the donation timers.
+      donation_time_start += (35 * 60);
+      donation_time_stop += (35 * 60);
+    } else {
+      opt_tuned = true;
+      applog(LOG_BLUE, "Tune config \'tune_config\' loaded succesfully");
+    }
+  }
+
   // Prepare and check Large Pages. At least 4MiB per thread.
   if (!InitHugePages(opt_n_threads * 4)) {
     applog(LOG_ERR, "Could not prepare Huge Pages.");
   } else {
     applog(LOG_BLUE, "Huge Pages set up successfuly.");
   }
+
+#ifdef __AES__
   // Prepare and set MSR.
   if (opt_set_msr) {
     if (!execute_msr(num_cpus)) {
@@ -4081,6 +4084,7 @@ int main(int argc, char *argv[]) {
       applog(LOG_BLUE, "MSR set up successfuly.");
     }
   }
+#endif
 
   work_restart =
       (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart));
diff --git a/miner.h b/miner.h
index 8b0d9c8..043fd16 100755
--- a/miner.h
+++ b/miner.h
@@ -28,6 +28,11 @@
 #include <stdbool.h>
 #include <sys/time.h>
 
+#ifdef __MINGW32__
+#include <winsock2.h>
+#include <windows.h>
+#endif
+
 #include <curl/curl.h>
 #include <jansson.h>
 #include <pthread.h>
@@ -837,21 +842,14 @@ Options:\n\
   -c, --config=FILE     load a JSON-format configuration file\n\
       --data-file       path and name of data file\n\
       --verify          enable additional time consuming start up tests\n\
-  -V, --version         display version information and exit\n\
-  -y                    disable application of MSR mod on the system\n"
-#ifdef __AVX2__
+  -V, --version         display version information and exit\n"
+#ifdef __AES__
                             "\
-      --cn-config=[LIST]  list of which cryptonight variant should be calculated using 2way method.\n\
-                          Cryptonight variants: Turtlelite, Turtle, Darklite, Dark, Lite, Fast\n\
-                          Available options:\n\
-                          'light' - default, use only SSE. [0,0,0,0,0,0]\n\
-                          'medium' - use mix of SSE2 & 2way. [0,1,1,1,0,0]\n\
-                          'heavy' - use only 2way. [1,1,1,1,1,1]\n\
-                          [LIST] - customm, list of ',' separated 6 values, 0 - SSE2,  1 - 2way\n"
+  -y                    disable application of MSR mod on the system\n"
 #endif
                             "\
-      --tune            Tune miner before mining. Takes 30 minutes. tune_config file is created and can be used.\n\
-      --tune-config=FILE  Point to the already created tune config, created by --tune\n\
+      --no-tune         disable tuning of the miner before mining. Tuning takes 34 minutes.\n\
+      --tune-config=FILE  Point to the already created tune config. Default file created by the miner is tune_config\n\
   -h, --help            display this help text and exit\n\
 ";
 
@@ -921,10 +919,7 @@ static struct option const options[] = {
     {"data-file", 1, NULL, 1027},
     {"verify", 0, NULL, 1028},
     {"version", 0, NULL, 'V'},
-#ifdef __AVX2__
-    {"cn-config", 1, NULL, 1101},
-#endif
-    {"tune", 0, NULL, 1103},
+    {"no-tune", 0, NULL, 1103},
     {"tune-config", 1, NULL, 1104},
     {0, 0, 0, 0}};
 
diff --git a/virtual_memory.c b/virtual_memory.c
index 119508e..dbd4817 100755
--- a/virtual_memory.c
+++ b/virtual_memory.c
@@ -1,11 +1,18 @@
 #include "virtual_memory.h"
 #include "miner.h" // applog
-#include "stdio.h"
+#include <math.h>  // ceil
+#include <stdio.h>
+#include <unistd.h> // usleep
 
 static bool huge_pages = false;
 __thread bool allocated_hp = false;
 __thread size_t currently_allocated = 0;
 
+// Large Page size should be a multiple of 2MiB.
+static inline size_t GetProperSize(size_t size) {
+  return (size_t)ceil((double)size / 2097152.) * 2097152;
+}
+
 #ifdef __MINGW32__
 // Windows
 #define UNICODE
@@ -231,7 +238,6 @@ bool InitHugePages(size_t threads) {
 #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
 void *AllocateLargePagesMemory(size_t size) {
   // Needs to be multiple of Large Pages (2 MiB).
-  size = ((size / 2097152) * 2097152) + 2097152;
 #if defined(__FreeBSD__)
   void *mem =
       mmap(0, size, PROT_READ | PROT_WRITE,
@@ -265,8 +271,7 @@ void *AllocateLargePagesMemory(size_t size) {
 
 void DeallocateLargePagesMemory(void **memory) {
   // Needs to be multiple of Large Pages (2 MiB).
-  size_t size = ((currently_allocated / 2097152) * 2097152) + 2097152;
-  int status = munmap(*memory, size);
+  int status = munmap(*memory, GetProperSize(currently_allocated));
   if (status != 0) {
     applog(LOG_ERR, "Could not properly deallocate memory!");
   }
@@ -298,6 +303,9 @@ void *AllocateMemory(size_t size) {
 void DeallocateMemory(void **memory) {
   if (allocated_hp) {
     DeallocateLargePagesMemory(memory);
+    // Wait a while (10ms) after deallocation. Should help with
+    // fast allocation afterwards.
+    usleep(10000);
   } else if (*memory != NULL) {
     // No special method of allocation was used.
     free(*memory);
@@ -305,8 +313,10 @@ void DeallocateMemory(void **memory) {
 }
 
 void PrepareMemory(void **memory, size_t size) {
-  if (*memory != NULL) {
-    DeallocateMemory(memory);
+  if (GetProperSize(currently_allocated) != GetProperSize(size)) {
+    if (*memory != NULL) {
+      DeallocateMemory(memory);
+    }
+    *memory = (void *)AllocateMemory(GetProperSize(size));
   }
-  *memory = (void *)AllocateMemory(size);
 }
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index 9d529bd..d5aaf71 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -55,6 +55,7 @@ mv cpuminer.exe bin/win/${4}/cpuminer-${2}.exe
 
 }
 
+
 #Non-AES
 # Generic SSE2
 compile "x86-64" "sse2" "-msse"