From 8628b3181a77f6a3d8a75893b50f282038afa84a Mon Sep 17 00:00:00 2001
From: Teal Dulcet
Date: Sat, 13 Apr 2024 08:50:57 -0700
Subject: [PATCH] Replaced ASSERT function with a macro and enabled LTO.

---
 .github/workflows/ci.yml | 10 ++++-----
 makemake.sh              |  2 +-
 src/mi64.c               | 44 ++++++++++++++++++++--------------------
 src/util.c               |  6 +++---
 src/util.h               |  4 ++--
 5 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fa74f0f3..1cc48440 100755
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -73,7 +73,7 @@ jobs:
         sudo apt-get -yqq install libhwloc-dev
     - name: Before script
       run: |
-        sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
         $CC --version
     - name: Script
       run: |
@@ -105,7 +105,7 @@ jobs:
         sudo apt-get -yqq install libhwloc-dev
     - name: Before script
       run: |
-        sed -i 's/-O3/-Og -fsanitize=thread/' makemake.sh
+        sed -i 's/-O3 -flto/-Og -fsanitize=thread/' makemake.sh
         $CC --version
     - name: Script
       run: |
@@ -240,7 +240,7 @@ jobs:
         brew install hwloc
     - name: Before script
       run: |
-        sed -i '' 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i '' 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
         clang --version
     - name: Script
       run: |
@@ -266,7 +266,7 @@ jobs:
         brew install hwloc
     - name: Before script
       run: |
-        sed -i '' 's/-O3/-Og -fsanitize=thread/' makemake.sh
+        sed -i '' 's/-O3 -flto/-Og -fsanitize=thread/' makemake.sh
         clang --version
     - name: Script
       run: |
@@ -354,7 +354,7 @@ - name: Before script
       shell: bash
       run: |
-        sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
         $CC --version
     - name: Script
       shell: bash
       run: |
diff --git a/makemake.sh b/makemake.sh
index 03ce283b..6d8a2a10 100644
--- a/makemake.sh
+++ b/makemake.sh
@@ -370,7 +370,7 @@ fi
 # stack trace of the issue. If one wishes, one can run 'strip -g Mlucas' to remove the debugging symbols:
 cat << EOF > Makefile
 CC ?= gcc
-CFLAGS = -fdiagnostics-color -Wall -g -O3 # -flto=auto
+CFLAGS = -fdiagnostics-color -Wall -g -O3 -flto # or -flto=auto
 CPPFLAGS ?= -I/usr/local/include -I/opt/homebrew/include
 LDFLAGS ?= -L/opt/homebrew/lib
 LDLIBS = ${LD_ARGS[@]} # -static
diff --git a/src/mi64.c b/src/mi64.c
index de15d35d..4ba9a6f3 100755
--- a/src/mi64.c
+++ b/src/mi64.c
@@ -838,7 +838,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
 "vmovdqa (%%rax),%%ymm0 \n\t"/* preload x[i0-(0:3)] */\
-"loop_shl_short: \n\t"\
+"loop_shl_short%=: \n\t"\
 "vmovdqa -0x20(%%rax),%%ymm2 \n\t"/* load x[i0-(4:7)] */\
 /* Starting with ymm0 = x[i0-(0:3)] and ymm2 = x[i0-(4:7)], need ymm1 = x[i0-(1:4)]: */\
 "vpblendd $0xC0,%%ymm2,%%ymm0,%%ymm1 \n\t"/* ymm1 = x[i0-(4,1,2,3)] [no penalty for applying this dword-instruction to qword data.]
@@ -861,7 +861,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -885,7 +885,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subl %[__i0],%%ecx \n\t"/* Skip the bottom (i0) elements */\
 "vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
-"loop_shl_short2: \n\t"\
+"loop_shl_short2%=: \n\t"\
 /* Replacing this sequence (and similarly in SHRL) with a preload-(0:3)/aligned-load-(4:7|8:b)/permute-to-get-(1:4|5:8) was slower (0.7 cycles/limb vs 0.95): */\
 /* i0-(0:3): */\
 "vmovdqu -0x08(%%rax),%%ymm1 \n\t"/* load x[i0-(1:4)] */\
@@ -905,7 +905,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -933,7 +933,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
 "movdqa (%%rax),%%xmm0 \n\t"/* preload x[i1-(0,1)] */\
-"loop_shl_short: \n\t"\
+"loop_shl_short%=: \n\t"\
 /* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead
 uses shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2. */\
 /* i1-(0,1): x[i1-(0,1)] in xmm0 */\
@@ -972,7 +972,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -998,7 +998,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
 "movdqa (%%rax),%%xmm0 \n\t"/* preload x[i1-(0,1)] */\
-"loop_shl_short2: \n\t"\
+"loop_shl_short2%=: \n\t"\
 /* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead uses shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2.
 */\
 /* i1-(0,1): x[i1-(0,1)] in xmm0 */\
@@ -1037,7 +1037,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -1062,7 +1062,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subl %[__i0],%%ebx \n\t"/* Skip the bottom (i0+1) elements */\
 "movslq %[__n],%%rcx \n\t"/* shift count */\
 "movq (%%r10),%%rax \n\t"/* SHRD allows mem-ref only in DEST, so preload x[i0] */\
-"loop_shl_short: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, append '2' to the label */\
+"loop_shl_short%=: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, %= keeps the label unique per asm instance */\
 /* i-0: */\
 "movq -0x08(%%r10),%%rsi \n\t"/* load x[i-1] ... the si in rsi stands for 'shift-in' :) */\
 "shldq %%cl,%%rsi,%%rax \n\t"/* (x[i],x[i-1])<<n */\
@@ -1582,7 +1582,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "addq $0x20,%%r10 \n\t"\
 "addq $0x20,%%r11 \n\t"\
 "subq $4,%%rbx \n\t"\
-"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
+"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
diff --git a/src/util.c b/src/util.c
index dbf189b1..72d5e61a 100644
--- a/src/util.c
+++ b/src/util.c
@@ -80,16 +80,16 @@ void WARN(long line, char*file, char*warn_string, char*warn_file, int copy2stderr)
 #else
 // void ASSERT(char*func, long line, char*file, int expr, char*assert_string) {
-	void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string) {
+	__attribute__ ((__noreturn__)) void ABORT(const char*assertion, const char*file, long line, const char*func, const char*assert_string) {
 	/* Define a convenient spot to set a breakpoint: */
-		if(!expr) {
+	//	if(!expr) {
 			fprintf(stderr,"ERROR: Function %s, at line %lu of file %s\n", func, line, file);
 			fprintf(stderr,"Assertion '%s' failed: %s\n", assertion, assert_string);
 			/* Flush all output streams prior to asserting. We replace the original assert(0) call with
 			an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
 			fflush(NULL);
 			// exit(EXIT_FAILURE);	// Try to make this line coincide with a line # == 0 (mod 100) to ease breakpointing
 			abort();
-		}
+	//	}
 	}
 #endif	// __CUDA_ARCH__ ?
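Note on the mi64.c hunks above: every one of them makes the same fix, rewriting hard-coded inline-asm labels such as loop_shl_short as loop_shl_short%=. With -flto now enabled, the compiler may inline or clone the containing function and emit the same asm template more than once per object file, at which point a fixed label is defined twice and assembly fails. The GCC/Clang %= operand expands to a number that is unique to each emitted asm instance, so every copy of the loop gets its own label. A minimal self-contained sketch of the idiom, assuming x86-64 GCC or Clang; busy_dec is a hypothetical example, not code from this patch:

#include <stdio.h>

/* Hypothetical demo: count n down to zero with a backward jump.
   The %= suffix is exactly what this patch adds to the mi64.c loop labels. */
static inline unsigned long busy_dec(unsigned long n) {
	__asm__ __volatile__(
		"busy_dec_loop%=: \n\t"	/* %= expands per instance, e.g. busy_dec_loop47: */
		"subq $1,%0       \n\t"	/* --n */
		"jnz busy_dec_loop%= \n\t"	/* jump back until n reaches 0 */
		: "+r" (n)	/* n is read and written in a register */
		:	/* no pure inputs */
		: "cc"	/* SUB clobbers the flags */
	);
	return n;	/* always 0 on return */
}

int main(void) {
	printf("%lu\n", busy_dec(1000000UL));	/* prints 0 */
	return 0;
}

GAS numeric local labels ("1:" with "jnz 1b") would solve the same collision; %= has the advantage of preserving the descriptive label names used throughout mi64.c.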
diff --git a/src/util.h b/src/util.h
index 620070ab..afd55c69 100755
--- a/src/util.h
+++ b/src/util.h
@@ -230,10 +230,10 @@ void WARN (long line, char*file, char*warn_string, char*warn_file, int copy2stderr)
 	void ASSERT(long line, char*file, int expr, char*assert_string);
 #else
 //	void ASSERT (long line, char*file, int expr, char*assert_string);
-	void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string);
+	__attribute__ ((__noreturn__)) void ABORT(const char*assertion, const char*file, long line, const char*func, const char*assert_string);
 #endif

-#define ASSERT(expr, assert_string) _ASSERT(#expr, __FILE__, __LINE__, __func__, (expr), assert_string)
+#define ASSERT(expr, assert_string) (void)((expr) || (ABORT(#expr, __FILE__, __LINE__, __func__, assert_string),0))

void VAR_WARN(char *typelist, ...);
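For reference, a minimal self-contained sketch of how the new ASSERT macro behaves; CHECK and fail_handler are hypothetical stand-ins for the patch's ASSERT and ABORT, whose real definitions are in the hunks above. The (void)((expr) || (handler(...),0)) shape evaluates expr exactly once, calls the noreturn handler only when the expression is false, and stringizes the expression via #expr so the failure message can print it:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the patch's ABORT(): report the failing site, then die. */
__attribute__ ((__noreturn__)) static void fail_handler(const char*assertion, const char*file, long line, const char*func, const char*msg) {
	fprintf(stderr, "ERROR: Function %s, at line %ld of file %s\n", func, line, file);
	fprintf(stderr, "Assertion '%s' failed: %s\n", assertion, msg);
	fflush(NULL);	/* flush every open stream before dying, as util.c does */
	abort();
}

/* Same shape as the new ASSERT: expr is tested once; on failure the
   (noreturn call, 0) comma-expression gives || the scalar operand it needs. */
#define CHECK(expr, msg) (void)((expr) || (fail_handler(#expr, __FILE__, __LINE__, __func__, msg),0))

int main(void) {
	int limbs = 4;
	CHECK(limbs > 0, "limb count must be positive");	/* passes silently */
	CHECK(limbs % 8 == 0, "limb count must be a multiple of 8");	/* fails: prints both lines, then abort()s */
	return 0;
}

Compared with the old _ASSERT function, the macro costs nothing but the test itself on the success path, and because ABORT is declared __noreturn__ the compiler cannot silently optimize the failure branch away, which is the concern the util.c comment raises about assert(0).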