Replaced the ASSERT function with a macro and enabled LTO.
tdulcet committed Apr 13, 2024
1 parent 45bbd6f commit 8628b31
Showing 5 changed files with 33 additions and 33 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
@@ -73,7 +73,7 @@ jobs:
sudo apt-get -yqq install libhwloc-dev
- name: Before script
run: |
sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
sed -i 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
$CC --version
- name: Script
run: |
@@ -105,7 +105,7 @@ jobs:
sudo apt-get -yqq install libhwloc-dev
- name: Before script
run: |
sed -i 's/-O3/-Og -fsanitize=thread/' makemake.sh
sed -i 's/-O3 -flto/-Og -fsanitize=thread/' makemake.sh
$CC --version
- name: Script
run: |
@@ -240,7 +240,7 @@ jobs:
brew install hwloc
- name: Before script
run: |
sed -i '' 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
sed -i '' 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
clang --version
- name: Script
run: |
@@ -266,7 +266,7 @@ jobs:
brew install hwloc
- name: Before script
run: |
sed -i '' 's/-O3/-Og -fsanitize=thread/' makemake.sh
sed -i '' 's/-O3 -flto/-Og -fsanitize=thread/' makemake.sh
clang --version
- name: Script
run: |
@@ -354,7 +354,7 @@ jobs:
- name: Before script
shell: bash
run: |
sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
sed -i 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
$CC --version
- name: Script
shell: bash
2 changes: 1 addition & 1 deletion makemake.sh
@@ -370,7 +370,7 @@ fi
# stack trace of the issue. If one wishes, one can run 'strip -g Mlucas' to remove the debugging symbols:
cat <<EOF >Makefile
CC ?= gcc
CFLAGS = -fdiagnostics-color -Wall -g -O3 # -flto=auto
CFLAGS = -fdiagnostics-color -Wall -g -O3 -flto #=auto
CPPFLAGS ?= -I/usr/local/include -I/opt/homebrew/include
LDFLAGS ?= -L/opt/homebrew/lib
LDLIBS = ${LD_ARGS[@]} # -static
44 changes: 22 additions & 22 deletions src/mi64.c
@@ -838,7 +838,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"vmovdqa (%%rax),%%ymm0 \n\t"/* preload x[i0-(0:3)] */\
"loop_shl_short: \n\t"\
"loop_shl_short%=: \n\t"\
"vmovdqa -0x20(%%rax),%%ymm2 \n\t"/* load x[i0-(4:7)] */\
/* Starting with ymm0 = x[i0-(0:3)] and ymm2 = x[i0-(4:7)], need ymm1 = x[i0-(1:4)]: */\
"vpblendd $0xC0,%%ymm2,%%ymm0,%%ymm1 \n\t"/* ymm1 = x[i0-(4,1,2,3)] [no penalty for applying this dword-instruction to qword data.]
@@ -861,7 +861,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subq $0x40,%%rax \n\t"\
"subq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -885,7 +885,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subl %[__i0],%%ecx \n\t"/* Skip the bottom (i0) elements */\
"vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"loop_shl_short2: \n\t"\
"loop_shl_short2%=: \n\t"\
/* Replacing this sequence (and similarly in SHRL) with a preload-(0:3)/aligned-load-(4:7|8:b)/permute-to-get-(1:4|5:8) was slower (0.7 cycles/limb vs 0.95): */\
/* i0-(0:3): */\
"vmovdqu -0x08(%%rax),%%ymm1 \n\t"/* load x[i0-(1:4)] */\
@@ -905,7 +905,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subq $0x40,%%rax \n\t"\
"subq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -933,7 +933,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"movdqa (%%rax),%%xmm0 \n\t"/* preload x[i1-(0,1)] */\
"loop_shl_short: \n\t"\
"loop_shl_short%=: \n\t"\
/* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead uses
shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2. */\
/* i1-(0,1): x[i1-(0,1)] in xmm0 */\
@@ -972,7 +972,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subq $0x40,%%rax \n\t"\
"subq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -998,7 +998,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"movdqa (%%rax),%%xmm0 \n\t"/* preload x[i1-(0,1)] */\
"loop_shl_short2: \n\t"\
"loop_shl_short2%=: \n\t"\
/* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead uses
shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2. */\
/* i1-(0,1): x[i1-(0,1)] in xmm0 */\
@@ -1037,7 +1037,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subq $0x40,%%rax \n\t"\
"subq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1062,7 +1062,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subl %[__i0],%%ebx \n\t"/* Skip the bottom (i0+1) elements */\
"movslq %[__n],%%rcx \n\t"/* shift count */\
"movq (%%r10),%%rax \n\t"/* SHRD allows mem-ref only in DEST, so preload x[i0] */\
"loop_shl_short: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, append '2' to the label */\
"loop_shl_short%=: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, append '2' to the label */\
/* i-0: */\
"movq -0x08(%%r10),%%rsi \n\t"/* load x[i-1] ... the si in rsi stands for 'shift-in' :) */\
"shldq %%cl,%%rsi,%%rax \n\t"/* (x[i],x[i-1])<<n */\
@@ -1083,7 +1083,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"subq $0x20,%%r10 \n\t"\
"subq $0x20,%%r11 \n\t"\
"subq $4,%%rbx \n\t"\
"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1277,7 +1277,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"vmovdqa (%%rax),%%ymm0 \n\t"/* preload x[3-0] */\
"loop_shrl_short: \n\t"\
"loop_shrl_short%=: \n\t"\
/* i0-i3: */\
"vmovdqa 0x20(%%rax),%%ymm2 \n\t"/* load x[7-4] */\
"vpblendd $3,%%ymm2,%%ymm0,%%ymm1 \n\t"/* ymm1 = 3,2,1,4 [no penalty for applying this dword-instruction to qword data.] */\
@@ -1298,7 +1298,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"addq $0x40,%%rax \n\t"\
"addq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1325,7 +1325,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"addl %[__i1],%%ecx \n\t"/* ASM loop structured as for(j = i1; j != i0; j -= 8){...} */\
"vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"loop_shrl_short: \n\t"\
"loop_shrl_short%=: \n\t"\
/* Replacing this sequence (and similarly in SHL) with the sequence
preload-(3:0);
aligned-load-(7:4|b:8);
@@ -1349,7 +1349,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"addq $0x40,%%rax \n\t"\
"addq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1383,7 +1383,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"movdqa (%%rax),%%xmm0 \n\t"/* preload x[1,0] */\
"loop_shrl_short: \n\t"\
"loop_shrl_short%=: \n\t"\
/* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead uses
shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2. */\
/* i+0,1: x[1,0] in xmm0 */\
@@ -1422,7 +1422,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"addq $0x40,%%rax \n\t"\
"addq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1446,7 +1446,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"movdqa (%%rax),%%xmm0 \n\t"/* preload x[1,0] */\
"loop_shrl_short2: \n\t"\
"loop_shrl_short2%=: \n\t"\
/* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead uses
shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2. */\
/* i+0,1: x[1,0] in xmm0 */\
@@ -1485,7 +1485,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"addq $0x40,%%rax \n\t"\
"addq $0x40,%%rbx \n\t"\
"subq $8,%%rcx \n\t"\
"jnz loop_shrl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
"jnz loop_shrl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1516,7 +1516,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
"movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
"movq (%%r10),%%rax \n\t"/* SHRD allows mem-ref only in DEST, so preload x[i+0] */\
"loop_shrl_short: \n\t"\
"loop_shrl_short%=: \n\t"\
"movdqa 0x20(%%r10),%%xmm0 \n\t"/* preload x[5,4] */\
"movq 0x20(%%r10),%%rdx \n\t"/* See "SSE2 write..." comment below for why need this */\
/* i+0: */ /* i+4,5: x[5,4] in xmm0 */\
@@ -1541,7 +1541,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movq (%%r10),%%rax \n\t"/* preload x[i+8] (already in xmm0[0:63], but there is no qword analog of MOVD),
at least none supported by my clang/gcc installs, due to 'movq' being assumed a 64-bit 'mov'. */\
"subq $8,%%rbx \n\t"\
"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
@@ -1561,7 +1561,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"movslq %[__i1],%%rbx \n\t"/* ASM loop structured as for(j = i1; j != 0; j -= 4){...} */\
"movslq %[__n],%%rcx \n\t"/* shift count */\
"movq (%%r10),%%rax \n\t"/* SHRD allows mem-ref only in DEST, so preload x[0] */\
"loop_shrl_short: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, append '2' to the label */\
"loop_shrl_short%=: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, append '2' to the label */\
/* i+0: */\
"movq 0x08(%%r10),%%rsi \n\t"/* load x[i+1] ... the si in rsi stands for 'shift-in' :) */\
"shrdq %%cl,%%rsi,%%rax \n\t"/* (x[i+1],x[i])>>n */\
@@ -1582,7 +1582,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
"addq $0x20,%%r10 \n\t"\
"addq $0x20,%%r11 \n\t"\
"subq $4,%%rbx \n\t"\
"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
: /* outputs: none */\
: [__x] "m" (x) /* All inputs from memory addresses here */\
,[__y] "m" (y) \
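The %= suffixes added to the asm labels above are what make these loops safe to build with LTO: GCC and Clang replace %= in an extended-asm template with a number that is unique to each instance of the asm statement, so when the surrounding function is inlined into several callers (far more likely once -flto is on), every copy gets its own local label instead of the assembler seeing the same symbol twice. A minimal sketch of the idiom, using a hypothetical helper rather than Mlucas code (x86-64, AT&T syntax, GCC/Clang extended asm assumed):

#include <stdint.h>

/* Count low zero words of x[0..len-1]. The loop labels carry %=, so if this
   inline function is expanded into several callers under -flto, each copy of
   the asm gets its own distinct "scan_loop_N"/"scan_done_N" labels. */
static inline uint64_t count_low_zero_words(const uint64_t *x, uint64_t len)
{
	uint64_t n = 0;
#if defined(__x86_64__) && defined(__GNUC__)
	__asm__ volatile(
		"scan_loop%=:                \n\t"
		"cmpq   %[len],%[n]          \n\t"	/* n >= len ? then stop */
		"jae    scan_done%=          \n\t"
		"cmpq   $0,(%[ptr],%[n],8)   \n\t"	/* first nonzero word ends the scan */
		"jne    scan_done%=          \n\t"
		"incq   %[n]                 \n\t"
		"jmp    scan_loop%=          \n\t"
		"scan_done%=:                \n\t"
		: [n] "+r" (n)
		: [ptr] "r" (x), [len] "r" (len)
		: "cc", "memory");
#else
	while (n < len && x[n] == 0)
		++n;
#endif
	return n;
}

With a fixed label such as "scan_loop:" the same code assembles while the asm is emitted once, but fails with a duplicate-label error as soon as it is expanded more than once into the same object file, which is exactly the situation -flto and inlining create.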
6 changes: 3 additions & 3 deletions src/util.c
@@ -80,16 +80,16 @@ void WARN(long line, char*file, char*warn_string, char*warn_file, int copy2stder
#else

// void ASSERT(char*func, long line, char*file, int expr, char*assert_string) {
void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string) {
__attribute__ ((__noreturn__)) void ABORT(const char*assertion, const char*file, long line, const char*func, const char*assert_string) {
/* Define a convenient spot to set a breakpoint: */
if(!expr) {
// if(!expr) {
fprintf(stderr,"ERROR: Function %s, at line %lu of file %s\n", func, line, file); fprintf(stderr,"Assertion '%s' failed: %s\n", assertion, assert_string);
/* Flush all output streams prior to asserting. We replace the original assert(0) call with
an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
fflush(NULL);
// exit(EXIT_FAILURE); // Try to make this line coincide with a line # == 0 (mod 100) to ease breakpointing
abort();
}
// }
}

#endif // __CUDA_ARCH__ ?
4 changes: 2 additions & 2 deletions src/util.h
@@ -230,10 +230,10 @@ void WARN (long line, char*file, char*warn_string, char*warn_file, int copy2stde
void ASSERT(long line, char*file, int expr, char*assert_string);
#else
// void ASSERT (long line, char*file, int expr, char*assert_string);
void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string);
__attribute__ ((__noreturn__)) void ABORT(const char*assertion, const char*file, long line, const char*func, const char*assert_string);
#endif

#define ASSERT(expr, assert_string) _ASSERT(#expr, __FILE__, __LINE__, __func__, (expr), assert_string)
#define ASSERT(expr, assert_string) (void)((expr) || (ABORT(#expr, __FILE__, __LINE__, __func__, assert_string),0))

void VAR_WARN(char *typelist, ...);

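Taken together, the util.c and util.h changes turn ASSERT from an unconditional call into _ASSERT into a pure expression macro: (expr) || (ABORT(...),0) short-circuits, so the __noreturn__ ABORT is reached only when the check fails, and the (void) cast lets the macro sit anywhere a statement can without unused-value warnings. A self-contained sketch of the same pattern (names are illustrative, not the Mlucas ones):

#include <stdio.h>
#include <stdlib.h>

/* Failure handler: report the failing expression and its context, then abort.
   The noreturn attribute tells the compiler control never comes back, so the
   assertion cannot be silently optimized into a fall-through. */
__attribute__((__noreturn__))
static void abort_with_msg(const char *assertion, const char *file, long line,
                           const char *func, const char *msg)
{
	fprintf(stderr, "ERROR: Function %s, at line %ld of file %s\n", func, line, file);
	fprintf(stderr, "Assertion '%s' failed: %s\n", assertion, msg);
	fflush(NULL);	/* flush every open stream before dying */
	abort();
}

/* Expression-style assert: || short-circuits, so the handler runs only when
   expr is false; #expr stringizes the condition for the error message. */
#define MY_ASSERT(expr, msg) \
	(void)((expr) || (abort_with_msg(#expr, __FILE__, __LINE__, __func__, (msg)), 0))

int main(void)
{
	int n = 8;
	MY_ASSERT(n % 2 == 0, "n must be even");	/* true: no call is made */
	MY_ASSERT(n > 10, "n must exceed 10");		/* false: prints and aborts */
	return 0;
}

Because the check is now inlined at the call site, the common (passing) path is a single test-and-branch rather than an out-of-line call into util.c on every assertion.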
