From 8628b3181a77f6a3d8a75893b50f282038afa84a Mon Sep 17 00:00:00 2001
From: Teal Dulcet
Date: Sat, 13 Apr 2024 08:50:57 -0700
Subject: [PATCH] Replaced ASSERT function with a macro and enabled LTO.

---
 .github/workflows/ci.yml | 10 ++++-----
 makemake.sh              |  2 +-
 src/mi64.c               | 44 ++++++++++++++++++++--------------------
 src/util.c               |  6 +++---
 src/util.h               |  4 ++--
 5 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fa74f0f3..1cc48440 100755
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -73,7 +73,7 @@ jobs:
         sudo apt-get -yqq install libhwloc-dev
     - name: Before script
       run: |
-        sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
         $CC --version
     - name: Script
       run: |
@@ -105,7 +105,7 @@ jobs:
         sudo apt-get -yqq install libhwloc-dev
     - name: Before script
       run: |
-        sed -i 's/-O3/-Og -fsanitize=thread/' makemake.sh
+        sed -i 's/-O3 -flto/-Og -fsanitize=thread/' makemake.sh
         $CC --version
     - name: Script
       run: |
@@ -240,7 +240,7 @@ jobs:
         brew install hwloc
     - name: Before script
       run: |
-        sed -i '' 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i '' 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
         clang --version
     - name: Script
       run: |
@@ -266,7 +266,7 @@ jobs:
         brew install hwloc
     - name: Before script
       run: |
-        sed -i '' 's/-O3/-Og -fsanitize=thread/' makemake.sh
+        sed -i '' 's/-O3 -flto/-Og -fsanitize=thread/' makemake.sh
         clang --version
     - name: Script
       run: |
@@ -354,7 +354,7 @@ - name: Before script
       shell: bash
       run: |
-        sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i 's/-O3 -flto/-Og -fsanitize=address,undefined/' makemake.sh
         $CC --version
     - name: Script
       shell: bash
       run: |
diff --git a/makemake.sh b/makemake.sh
index 03ce283b..6d8a2a10 100644
--- a/makemake.sh
+++ b/makemake.sh
@@ -370,7 +370,7 @@ fi
 # stack trace of the issue. If one wishes, one can run 'strip -g Mlucas' to remove the debugging symbols:
 cat << EOF > Makefile
 CC ?= gcc
-CFLAGS = -fdiagnostics-color -Wall -g -O3 # -flto=auto
+CFLAGS = -fdiagnostics-color -Wall -g -O3 -flto # or -flto=auto
 CPPFLAGS ?= -I/usr/local/include -I/opt/homebrew/include
 LDFLAGS ?= -L/opt/homebrew/lib
 LDLIBS = ${LD_ARGS[@]} # -static
diff --git a/src/mi64.c b/src/mi64.c
index de15d35d..4ba9a6f3 100755
--- a/src/mi64.c
+++ b/src/mi64.c
@@ -838,7 +838,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
 "vmovdqa (%%rax),%%ymm0 \n\t"/* preload x[i0-(0:3)] */\
-"loop_shl_short: \n\t"\
+"loop_shl_short%=: \n\t"\
 "vmovdqa -0x20(%%rax),%%ymm2 \n\t"/* load x[i0-(4:7)] */\
 /* Starting with ymm0 = x[i0-(0:3)] and ymm2 = x[i0-(4:7)], need ymm1 = x[i0-(1:4)]: */\
 "vpblendd $0xC0,%%ymm2,%%ymm0,%%ymm1 \n\t"/* ymm1 = x[i0-(4,1,2,3)] [no penalty for applying this dword-instruction to qword data.]
@@ -861,7 +861,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -885,7 +885,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subl %[__i0],%%ecx \n\t"/* Skip the bottom (i0) elements */\
 "vmovd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "vmovd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
-"loop_shl_short2: \n\t"\
+"loop_shl_short2%=: \n\t"\
 /* Replacing this sequence (and similarly in SHRL) with a preload-(0:3)/aligned-load-(4:7|8:b)/permute-to-get-(1:4|5:8) was slower (0.7 cycles/limb vs 0.95): */\
 /* i0-(0:3): */\
 "vmovdqu -0x08(%%rax),%%ymm1 \n\t"/* load x[i0-(1:4)] */\
@@ -905,7 +905,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -933,7 +933,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
 "movdqa (%%rax),%%xmm0 \n\t"/* preload x[i1-(0,1)] */\
-"loop_shl_short: \n\t"\
+"loop_shl_short%=: \n\t"\
 /* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead
 uses shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2. */\
 /* i1-(0,1): x[i1-(0,1)] in xmm0 */\
@@ -972,7 +972,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -998,7 +998,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "movd %[__n] ,%%xmm14 \n\t"/* shift count - since imm-operands only take compile-time consts, this costs a vector register */\
 "movd %[__nc],%%xmm15 \n\t"/* complement-shift count, 64-n */\
 "movdqa (%%rax),%%xmm0 \n\t"/* preload x[i1-(0,1)] */\
-"loop_shl_short2: \n\t"\
+"loop_shl_short2%=: \n\t"\
 /* 1st version did 2 MOVDQU-load per double-qword output; current version does just 1 MOVDQU, instead uses shuffles to generate the 1-qword-staggered shift-in-data xmm-register operand, cuts cycles by 15% on Core2.
 */\
 /* i1-(0,1): x[i1-(0,1)] in xmm0 */\
@@ -1037,7 +1037,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subq $0x40,%%rax \n\t"\
 "subq $0x40,%%rbx \n\t"\
 "subq $8,%%rcx \n\t"\
-"jnz loop_shl_short2 \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
+"jnz loop_shl_short2%= \n\t"/* loop end; continue is via jump-back if rcx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
@@ -1062,7 +1062,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "subl %[__i0],%%ebx \n\t"/* Skip the bottom (i0+1) elements */\
 "movslq %[__n],%%rcx \n\t"/* shift count */\
 "movq (%%r10),%%rax \n\t"/* SHRD allows mem-ref only in DEST, so preload x[i0] */\
-"loop_shl_short: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, append '2' to the label */\
+"loop_shl_short%=: \n\t"/* Since this non-SIMD asm-code may be active along with the SIMD, %= keeps the label unique per asm instance */\
 /* i-0: */\
 "movq -0x08(%%r10),%%rsi \n\t"/* load x[i-1] ... the si in rsi stands for 'shift-in' :) */\
 "shldq %%cl,%%rsi,%%rax \n\t"/* (x[i],x[i-1])<<n */\
@@ -1582,7 +1582,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 "addq $0x20,%%r10 \n\t"\
 "addq $0x20,%%r11 \n\t"\
 "subq $4,%%rbx \n\t"\
-"jnz loop_shrl_short \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
+"jnz loop_shrl_short%= \n\t"/* loop end; continue is via jump-back if rbx != 0 */\
 : /* outputs: none */\
 : [__x] "m" (x) /* All inputs from memory addresses here */\
 ,[__y] "m" (y) \
diff --git a/src/util.c b/src/util.c
index dbf189b1..72d5e61a 100644
--- a/src/util.c
+++ b/src/util.c
@@ -80,16 +80,16 @@ void WARN(long line, char*file, char*warn_string, char*warn_file, int copy2stderr)
 #else
 // void ASSERT(char*func, long line, char*file, int expr, char*assert_string) {
-	void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string) {
+	__attribute__ ((__noreturn__)) void ABORT(const char*assertion, const char*file, long line, const char*func, const char*assert_string) {
 	/* Define a convenient spot to set a breakpoint: */
-		if(!expr) {
+	//	if(!expr) {
 			fprintf(stderr,"ERROR: Function %s, at line %lu of file %s\n", func, line, file);
 			fprintf(stderr,"Assertion '%s' failed: %s\n", assertion, assert_string);
 			/* Flush all output streams prior to asserting. We replace the original assert(0) call with
 			an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
 			fflush(NULL);
 			// exit(EXIT_FAILURE);	// Try to make this line coincide with a line # == 0 (mod 100) to ease breakpointing
 			abort();
-		}
+	//	}
 	}
 #endif	// __CUDA_ARCH__ ?
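Note on the mi64.c hunks above: every one of them makes the same fix, rewriting hard-coded inline-asm labels such as loop_shl_short as loop_shl_short%=. With -flto now enabled, the compiler may inline or clone the containing function and emit the same asm template more than once per object file, at which point a fixed label is defined twice and assembly fails. The GCC/Clang %= operand expands to a number that is unique to each emitted asm instance, so every copy of the loop gets its own label. A minimal self-contained sketch of the idiom, assuming x86-64 GCC or Clang; busy_dec is a hypothetical example, not code from this patch:

#include <stdio.h>

/* Hypothetical demo: count n down to zero with a backward jump.
   The %= suffix is exactly what this patch adds to the mi64.c loop labels. */
static inline unsigned long busy_dec(unsigned long n) {
	__asm__ __volatile__(
		"busy_dec_loop%=: \n\t"	/* %= expands per instance, e.g. busy_dec_loop47: */
		"subq $1,%0       \n\t"	/* --n */
		"jnz busy_dec_loop%= \n\t"	/* jump back until n reaches 0 */
		: "+r" (n)	/* n is read and written in a register */
		:	/* no pure inputs */
		: "cc"	/* SUB clobbers the flags */
	);
	return n;	/* always 0 on return */
}

int main(void) {
	printf("%lu\n", busy_dec(1000000UL));	/* prints 0 */
	return 0;
}

GAS numeric local labels ("1:" with "jnz 1b") would solve the same collision; %= has the advantage of preserving the descriptive label names used throughout mi64.c.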
diff --git a/src/util.h b/src/util.h
index 620070ab..afd55c69 100755
--- a/src/util.h
+++ b/src/util.h
@@ -230,10 +230,10 @@ void WARN (long line, char*file, char*warn_string, char*warn_file, int copy2stderr)
 	void ASSERT(long line, char*file, int expr, char*assert_string);
 #else
 //	void ASSERT (long line, char*file, int expr, char*assert_string);
-	void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string);
+	__attribute__ ((__noreturn__)) void ABORT(const char*assertion, const char*file, long line, const char*func, const char*assert_string);
 #endif

-#define ASSERT(expr, assert_string) _ASSERT(#expr, __FILE__, __LINE__, __func__, (expr), assert_string)
+#define ASSERT(expr, assert_string) (void)((expr) || (ABORT(#expr, __FILE__, __LINE__, __func__, assert_string),0))

void VAR_WARN(char *typelist, ...);
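For reference, a minimal self-contained sketch of how the new ASSERT macro behaves; CHECK and fail_handler are hypothetical stand-ins for the patch's ASSERT and ABORT, whose real definitions are in the hunks above. The (void)((expr) || (handler(...),0)) shape evaluates expr exactly once, calls the noreturn handler only when the expression is false, and stringizes the expression via #expr so the failure message can print it:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the patch's ABORT(): report the failing site, then die. */
__attribute__ ((__noreturn__)) static void fail_handler(const char*assertion, const char*file, long line, const char*func, const char*msg) {
	fprintf(stderr, "ERROR: Function %s, at line %ld of file %s\n", func, line, file);
	fprintf(stderr, "Assertion '%s' failed: %s\n", assertion, msg);
	fflush(NULL);	/* flush every open stream before dying, as util.c does */
	abort();
}

/* Same shape as the new ASSERT: expr is tested once; on failure the
   (noreturn call, 0) comma-expression gives || the scalar operand it needs. */
#define CHECK(expr, msg) (void)((expr) || (fail_handler(#expr, __FILE__, __LINE__, __func__, msg),0))

int main(void) {
	int limbs = 4;
	CHECK(limbs > 0, "limb count must be positive");	/* passes silently */
	CHECK(limbs % 8 == 0, "limb count must be a multiple of 8");	/* fails: prints both lines, then abort()s */
	return 0;
}

Compared with the old _ASSERT function, the macro costs nothing but the test itself on the success path, and because ABORT is declared __noreturn__ the compiler cannot silently optimize the failure branch away, which is the concern the util.c comment raises about assert(0).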