Skip to content

Commit

Permalink
Merge pull request #39 from okuhara/neon_vaddvq
Browse files Browse the repository at this point in the history
Resurrect neon vaddvq version of count_last_flip
  • Loading branch information
abulmo authored Dec 7, 2024
2 parents dbcac52 + 9635daf commit 8dbf3b6
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
cd src
${{ matrix.build_command }}
- uses: actions/upload-artifact@v2
- uses: actions/upload-artifact@v4
with:
name: artifact_${{ runner.os }}
path: bin
8 changes: 4 additions & 4 deletions src/board.c
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ static uint64x2_t transpose_neon(const uint64x2_t board)
*/
void board_horizontal_mirror(const Board *board, Board *sym)
{
#if defined(__ARM_NEON)
#if USE_SIMD & defined(__ARM_NEON)
vst1q_u64((uint64_t *) sym, horizontal_mirror_neon(vld1q_u64((uint64_t *) board)));
#else
sym->player = horizontal_mirror(board->player);
Expand All @@ -428,7 +428,7 @@ void board_horizontal_mirror(const Board *board, Board *sym)
*/
void board_vertical_mirror(const Board *board, Board *sym)
{
#if defined(__ARM_NEON)
#if USE_SIMD & defined(__ARM_NEON)
vst1q_u64((uint64_t *) sym, vertical_mirror_neon(vld1q_u64((uint64_t *) board)));
#else
sym->player = vertical_mirror(board->player);
Expand Down Expand Up @@ -716,7 +716,7 @@ static inline uint64_t get_some_moves(const uint64_t P, const uint64_t mask, con
const uint64x2_t PP = vdupq_n_u64(P);
const uint64x2_t MM = vdupq_n_u64(mask);
const uint64x2_t moves = get_some_moves_neon(PP, MM, dir);
return vget_low_u64(moves) | vget_high_u64(moves);
return vgetq_lane_u64(moves, 0) | vgetq_lane_u64(moves, 1);

#elif PARALLEL_PREFIX & 1

Expand Down Expand Up @@ -831,7 +831,7 @@ uint64_t get_moves(const uint64_t P, const uint64_t O)
moves = vorrq_u64(moves, get_some_moves_neon(PP, OO, 8)); // vertical
moves = vorrq_u64(moves, get_some_moves_neon(PP, MM, 7)); // diagonals
moves = vorrq_u64(moves, get_some_moves_neon(PP, MM, 9));
return (vget_low_u64(moves) | vget_high_u64(moves)) & E; // mask with empties
return (vgetq_lane_u64(moves, 0) | vgetq_lane_u64(moves, 1)) & E; // mask with empties

#else

Expand Down
24 changes: 19 additions & 5 deletions src/count_last_flip_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,18 @@
* For optimization purposes, the value returned is twice the number of flipped
* discs, to facilitate the computation of disc difference.
*
* @date 1998 - 2023
* @date 1998 - 2024
* @author Richard Delorme
* @author Toshihiko Okuhara
* @version 4.5
* @version 4.6
*
*/

#include <arm_neon.h>
#include <stdio.h>

#define COUNT_LAST_FLIP_NEON_VADDVQ

/** precomputed count flip array */
const unsigned char COUNT_FLIP[8][256] = {
{
Expand Down Expand Up @@ -111,7 +113,7 @@ const unsigned char COUNT_FLIP[8][256] = {
},
};

#ifdef HAS_CPU_64
#ifdef COUNT_LAST_FLIP_NEON_VADDVQ
/* bit masks for diagonal lines (interleaved) */
const uint64x2_t mask_dvhd[64][2] = {
{{ 0x000000000000ff01, 0x0000000000000000 }, { 0x0801040102010101, 0x8001400120011001 }},
Expand Down Expand Up @@ -264,9 +266,21 @@ int count_last_flip(int pos, uint64_t P)
const unsigned char *COUNT_FLIP_Y = COUNT_FLIP[pos >> 3];
uint64x2_t PP = vdupq_n_u64(P);
uint64x2_t II;
#ifdef COUNT_LAST_FLIP_NEON_VADDVQ // vaddvq
unsigned int t;
const uint64x2_t dmask = { 0x0808040402020101, 0x8080404020201010 };

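// vaddvq version: duplicate each byte of P into a 16-bit lane so that a single vaddvq_u16 yields two table indices (high and low byte of t)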
PP = vreinterpretq_u64_u8(vzip1q_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(PP)));
II = vandq_u64(PP, mask_dvhd[pos][0]); // 2 dirs interleaved
t = vaddvq_u16(vreinterpretq_u16_u64(II));
n_flips = COUNT_FLIP_X[t >> 8];
n_flips += COUNT_FLIP_X[t & 0xFF];
II = vandq_u64(vreinterpretq_u64_u8(vtstq_u8(vreinterpretq_u8_u64(PP), vreinterpretq_u8_u64(mask_dvhd[pos][1]))), dmask);
t = vaddvq_u16(vreinterpretq_u16_u64(II));
n_flips += COUNT_FLIP_Y[t >> 8];
n_flips += COUNT_FLIP_Y[t & 0xFF];

#else // Neon kindergarten
const uint64x2_t dmask = { 0x1020408001020408, 0x1020408001020408 };
uint64x2_t PP = vdupq_n_u64(P);
n_flips = 0;
Expand All @@ -277,6 +291,6 @@ int count_last_flip(int pos, uint64_t P)
II = vpaddlq_u32(vmulq_u32(vreinterpretq_u32_u64(dmask), vreinterpretq_u32_u64(II)));
n_flips += COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 11)];
n_flips += COUNT_FLIP_Y[vgetq_lane_u8(vreinterpretq_u8_u64(II), 3)];

#endif
return n_flips;
}

0 comments on commit 8dbf3b6

Please sign in to comment.