diff --git a/src/Mdata.h b/src/Mdata.h
index 7c46ef3b..19e913e7 100755
--- a/src/Mdata.h
+++ b/src/Mdata.h
@@ -222,7 +222,7 @@ extern char PSTRING[STR_MAX_LEN];	/* Number being tested in string form, typical
 #endif
 
 extern const int hex_chars[16];
-extern char cbuf[STR_MAX_LEN], cstr[STR_MAX_LEN];
+extern char cbuf[STR_MAX_LEN*2], cstr[STR_MAX_LEN];	/* cbuf size must stay identical to its definition in Mlucas.c; 2x presumably so a message prefix plus a full in_line (STR_MAX_LEN chars) fits - confirm */
 extern char in_line[STR_MAX_LEN];
 extern char *char_addr;
 extern int char_offset;
diff --git a/src/Mlucas.c b/src/Mlucas.c
index ea0746c0..2969ef33 100644
--- a/src/Mlucas.c
+++ b/src/Mlucas.c
@@ -111,7 +111,7 @@ char PSTRING[STR_MAX_LEN];	// Modulus being used in string form, e.g. "M11091692
 #endif
 
 const int hex_chars[16] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'};
-char cbuf[STR_MAX_LEN],cstr[STR_MAX_LEN];
+char cbuf[STR_MAX_LEN*2],cstr[STR_MAX_LEN];	// cbuf size must stay identical to the extern declaration in Mdata.h; the snprintf(cbuf,STR_MAX_LEN*2,...) calls that echo in_line rely on this bound
 char in_line[STR_MAX_LEN];
 char *char_addr;
 
@@ -481,15 +481,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
   #ifdef USE_OMP
 	// OpenMP not currently supported (attempting to build with this #define enabled barfs in
 	// preprocessing via #error in platform.h), this is merely placeholder for possible future use:
-	ASSERT(HERE, MAX_THREADS = omp_get_num_procs(), "Illegal #Cores value stored in MAX_THREADS");
+	ASSERT(MAX_THREADS = omp_get_num_procs(), "Illegal #Cores value stored in MAX_THREADS");
   #elif(defined(USE_PTHREAD))
-	ASSERT(HERE, MAX_THREADS =     get_num_cores(), "Illegal #Cores value stored in MAX_THREADS");
+	ASSERT(MAX_THREADS =     get_num_cores(), "Illegal #Cores value stored in MAX_THREADS");
   #else
 	#error Unrecognized multithreading model!
   #endif
 	// MAX_THREADS based on number of processing cores will most often be a power of 2, but don't assume that.
-	ASSERT(HERE, MAX_THREADS > 0,"MAX_THREADS must be > 0");
-	ASSERT(HERE, MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h .");
+	ASSERT(MAX_THREADS > 0,"MAX_THREADS must be > 0");
+	ASSERT(MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h .");
 
 	if(!NTHREADS) {
 		NTHREADS = 1;
@@ -499,7 +499,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		parseAffinityString(cbuf);
 	} else if(NTHREADS > MAX_CORES) {
 		sprintf(cbuf,"ERROR: NTHREADS = %d exceeds the MAX_CORES setting in Mdata.h = %d\n", NTHREADS, MAX_CORES);
-		ASSERT(HERE, 0, cbuf);
+		ASSERT(0, cbuf);
 	} else {	// In timing-test mode, allow #threads > #cores
 		if(NTHREADS > MAX_THREADS) {
 			fprintf(stderr,"WARN: NTHREADS = %d exceeds number of cores = %d\n", NTHREADS, MAX_THREADS);
@@ -524,7 +524,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		else
 			ITERS_BETWEEN_CHECKPOINTS =  10000;
 	} else if(check_interval < 1000) {
-		ASSERT(HERE,0,"User-set value of check_interval must >= 1000.");
+		ASSERT(0,"User-set value of check_interval must >= 1000.");
 	} else
 		ITERS_BETWEEN_CHECKPOINTS = check_interval;
 
@@ -532,7 +532,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 
 	i = ITERS_BETWEEN_GCHECKS;
 	j = ITERS_BETWEEN_GCHECK_UPDATES;
-	ASSERT(HERE, i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)");
+	ASSERT(i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)");
 	// v19: If PRP test, make sure Gerbicz-checkproduct interval divides checkpoint-writing one.
 	// If not true, merely warn here because user may be doing LL/DC/p-1 and not PRP-tests:
 	k = ITERS_BETWEEN_CHECKPOINTS;
@@ -542,8 +542,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	// Alloc bitwise multiply-by-base array, needed to support P-1 factoring and PRP testing:
 	if(!BASE_MULTIPLIER_BITS) {
 		j = ((ITERS_BETWEEN_CHECKPOINTS+63) >> 6) + 1;	// Add 1 pad element in case compiler does not 64-bit align
-		BASE_MULTIPLIER_BITS = ALLOC_UINT64(BASE_MULTIPLIER_BITS, j);	if(!BASE_MULTIPLIER_BITS){ sprintf(cbuf, "ERROR: unable to allocate BASE_MULTIPLIER_BITS array in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		BASE_MULTIPLIER_BITS = ALIGN_UINT64(BASE_MULTIPLIER_BITS);	ASSERT(HERE, ((intptr_t)BASE_MULTIPLIER_BITS & 63) == 0x0,"BASE_MULTIPLIER_BITS[] not aligned on 64-byte boundary!");
+		BASE_MULTIPLIER_BITS = ALLOC_UINT64(BASE_MULTIPLIER_BITS, j);	if(!BASE_MULTIPLIER_BITS){ sprintf(cbuf, "ERROR: unable to allocate BASE_MULTIPLIER_BITS array in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		BASE_MULTIPLIER_BITS = ALIGN_UINT64(BASE_MULTIPLIER_BITS);	ASSERT(((intptr_t)BASE_MULTIPLIER_BITS & 63) == 0x0,"BASE_MULTIPLIER_BITS[] not aligned on 64-byte boundary!");
 		for(i = 0; i < j; i++) { BASE_MULTIPLIER_BITS[i] = 0ull; }	// v20: Init = 0 here, in case we jump directly into p-1 stage 2 on restart
 	}
 
@@ -648,17 +648,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			char_addr += 3;
 			// Check [k,b,n,c] portion of in_line:
 			cptr = check_kbnc(char_addr, &p);
-			ASSERT(HERE, cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!");
+			ASSERT(cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!");
 			// Next 2 entries in in_line are how-far-factored and "# of PRP tests that will be saved if P-1 is done and finds a factor":
 			TF_BITS = 0xffffffff; tests_saved = 0.0;
 			if((char_addr = strstr(cptr, ",")) != 0x0) {
 				cptr++;
 				// Only check if there's an appropriate TF_BITS entry in the input line
 				TF_BITS = strtoul(++char_addr, &endp, 10);
-				ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found after TF_BITS field in assignment-specifying line!");	cptr++;
+				ASSERT((char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found after TF_BITS field in assignment-specifying line!");	cptr++;
 				tests_saved = strtod(++char_addr, &endp);
 				if(tests_saved < 0 || tests_saved > 2) {
-					sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved);	ASSERT(HERE,0,cbuf);
+					sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved);	ASSERT(0,cbuf);
 				}
 				// char_addr now points to leftmost char of tests_saved field, which we will overwrite with 0;
 				// endp points to to-be-appended leftover portion
@@ -671,14 +671,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				// Create p-1 assignment, then edit original assignment line appropriately
 				TEST_TYPE = TEST_TYPE_PM1;
 				kblocks = get_default_fft_length(p);
-				ASSERT(HERE, pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!");
+				ASSERT(pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!");
 				// Format the p-1 assignment into cbuf - use cptr here, as need to preserve value of char_addr:
-				cptr = strstr(in_line, "=");	ASSERT(HERE,cptr != 0x0,"Malformed assignment!");
+				cptr = strstr(in_line, "=");	ASSERT(cptr != 0x0,"Malformed assignment!");
 				cptr++;	while(isspace(*cptr)) { ++cptr; }	// Skip any whitespace following the equals sign
 				if(is_hex_string(cptr, 32)) {
-					strncpy(aid,cptr,32);	sprintf(cbuf,"Pminus1=%s,1,2,%llu,-1,%u,%llu\n",aid,p,B1,B2);	// If we get here, it's a M(p), not F(m)
+					strncpy(aid,cptr,32);	sprintf(cbuf,"Pminus1=%s,1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",aid,p,B1,B2);	// If we get here, it's a M(p), not F(m)
 				} else
-					sprintf(cbuf,"Pminus1=1,2,%llu,-1,%u,%llu\n",p,B1,B2);
+					sprintf(cbuf,"Pminus1=1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",p,B1,B2);
 				// Copy up to the final (tests_saved) char of the assignment into cstr and append tests_saved = 0;
 				// A properly formatted tests_saved field is 1 char wide and begins at the current value of char_addr:
 				i = char_addr - in_line; strncpy(cstr,in_line, i); cstr[i] = '0'; cstr[i+1] = '\0';
@@ -695,17 +695,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				TEST_TYPE = TEST_TYPE_PRP;
 			} else {	// PRP double-check:
 				// NB: Hit a gcc compiler bug (which left i = 0 for e.g. char_addr = ", 3 ,...") using -O0 here ... clang compiled correctly, as did gcc -O1:
-				i = (int)strtol(char_addr+1, &cptr, 10); // PRP bases other than 3 allowed; see https://github.com/primesearch/Mlucas/issues/18 //	ASSERT(HERE, i == 3,"PRP-test base must be 3!");
+				i = (int)strtol(char_addr+1, &cptr, 10); // PRP bases other than 3 allowed; see https://github.com/primesearch/Mlucas/issues/18 //	ASSERT(i == 3,"PRP-test base must be 3!");
 				PRP_BASE = i;
-				ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found in assignment-specifying line!");
-				i = (int)strtol(char_addr+1, &cptr, 10); ASSERT(HERE, i == 1 || i == 5,"Only PRP-tests of type 1 (PRP-only) and type 5 (PRP and subsequent cofactor-PRP check) supported!");
+				ASSERT((char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found in assignment-specifying line!");
+				i = (int)strtol(char_addr+1, &cptr, 10); ASSERT(i == 1 || i == 5,"Only PRP-tests of type 1 (PRP-only) and type 5 (PRP and subsequent cofactor-PRP check) supported!");
 				// Read in known prime-factors, if any supplied - resulting factors end up in KNOWN_FACTORS[]:
 				if(*cptr == ',')						//vv--- Pass in unused file-ptr fq here in case function emits any messages:
 					nfac = extract_known_factors(p,cptr+1);
 				// Use 0-or-not-ness of KNOWN_FACTORS[0] to differentiate between PRP-only and PRP-CF:
 				if(KNOWN_FACTORS[0] != 0ull) {
-					ASSERT(HERE, i == 5,"Only PRP-CF tests of type 5 supported!");
-					if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) ASSERT(HERE, PRP_BASE == 3, "PRP-CF test base for Fermat numbers must be 3!");
+					ASSERT(i == 5,"Only PRP-CF tests of type 5 supported!");
+					if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) ASSERT(PRP_BASE == 3, "PRP-CF test base for Fermat numbers must be 3!");
 				}
 			}
 			goto GET_EXPO;
@@ -715,7 +715,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			char_addr += 6;
 			/* Look for comma following the modulus keyword and position next-keyword search right after it: */
 			if(!STREQN(char_addr,",",1))
-				ASSERT(HERE, 0,"Expected ',' not found in input following modulus type specifier!");
+				ASSERT(0,"Expected ',' not found in input following modulus type specifier!");
 			else
 				char_addr++;
 
@@ -729,7 +729,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			char_addr += 8;
 			/* Look for comma following the modulus keyword and position next-keyword search right after it: */
 			if(!STREQN(char_addr,",",1))
-				ASSERT(HERE, 0,"Expected ',' not found in input following modulus type specifier!");
+				ASSERT(0,"Expected ',' not found in input following modulus type specifier!");
 			else
 				char_addr++;
 		}
@@ -771,10 +771,10 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			char_addr += 7;
 			// Check [k,b,n,c] portion of in_line:
 			cptr = check_kbnc(char_addr, &p);
-			ASSERT(HERE, cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!");
-			ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
+			ASSERT(cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!");
+			ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
 			B1 = (uint32)strtoul (char_addr+1, &cptr, 10);
-			ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
+			ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
 			/* The C11 standard re. strtoull: "On success the function returns the converted integer as unsigned long long int type
 			and sets endPtr to point to the first character after the input number. On failure it returns 0 and sets endPtr to
 			point to NULL. It handles integer overflows efficiently and return ULONG_LONG_MAX on overflow."
@@ -782,12 +782,12 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			also set endPtr to point to the first character after the input, which leaves some ambiguity - what if the
 			input was in fact == ULONG_LONG_MAX? We assume here that nobody will use a p-1 stage bound so large:
 			*/
-			B2 = (uint64)strtoull(char_addr+1, &cptr, 10);	ASSERT(HERE, B2 != -1ull, "strtoull() overflow detected.");
+			B2 = (uint64)strtoull(char_addr+1, &cptr, 10);	ASSERT(B2 != -1ull, "strtoull() overflow detected.");
 			// Remaining args optional, with the 2 numerics presumed in-order, e.g. we only look for ',B2_start' field if ',TF_BITS' was present:
 			if((char_addr = strstr(cptr, ",")) != 0x0) {
-				TF_BITS = (int)strtoul(char_addr+1, &cptr, 10);	ASSERT(HERE, TF_BITS < 100 ,"TF_BITS value read from assignment is out of range.");
+				TF_BITS = (int)strtoul(char_addr+1, &cptr, 10);	ASSERT(TF_BITS < 100 ,"TF_BITS value read from assignment is out of range.");
 				if((char_addr = strstr(cptr, ",")) != 0x0) {
-					B2_start = (uint64)strtoull(char_addr+1, &cptr, 10);	ASSERT(HERE, B2_start != -1ull, "strtoull() overflow detected.");
+					B2_start = (uint64)strtoull(char_addr+1, &cptr, 10);	ASSERT(B2_start != -1ull, "strtoull() overflow detected.");
 					if(B2_start > B1)	// It's a stage 2 continuation run
 						s2_continuation = TRUE;
 					// Read in known prime-factors, if any supplied - resulting factors end up in KNOWN_FACTORS[]:
@@ -804,15 +804,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			char_addr += 7;
 			// Check [k,b,n,c] portion of in_line:
 			cptr = check_kbnc(char_addr, &p);
-			ASSERT(HERE, cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!");
-			ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
+			ASSERT(cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!");
+			ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
 			TF_BITS = (int)strtoul(char_addr+1, &cptr, 10);
-			ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
+			ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!");
 			tests_saved = strtod(++char_addr, &endp);
 			if(tests_saved < 0 || tests_saved > 2) {
-				sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved);	ASSERT(HERE,0,cbuf);
+				sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved);	ASSERT(0,cbuf);
 			}
-			ASSERT(HERE, pm1_set_bounds(p, get_default_fft_length(p)<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!");
+			ASSERT(pm1_set_bounds(p, get_default_fft_length(p)<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!");
 		}
 	#if INCLUDE_ECM
 		else if(strstr(char_addr, "ECM"))
@@ -823,13 +823,13 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	#endif
 		else
 		{
-			snprintf_nowarn(cbuf,STR_MAX_LEN,"WARN: Unrecognized/Unsupported option or empty assignment line. The ini file entry was %s\n",in_line);
+			snprintf(cbuf,STR_MAX_LEN*2,"WARN: Unrecognized/Unsupported option or empty assignment line. The ini file entry was %s\n",in_line);
 			fprintf(stderr,"%s",cbuf);
 			goto read_next_assignment;
 		}
 
 		if(!p) {	// For legacy assignment types, set p here
-			ASSERT(HERE, (char_addr = strstr(char_addr, "=")) != 0x0,"Expected '=' not found in assignment-specifying line!");
+			ASSERT((char_addr = strstr(char_addr, "=")) != 0x0,"Expected '=' not found in assignment-specifying line!");
 			char_addr++;
 			/* Skip any whitespace following the equals sign:*/
 			while(isspace(*char_addr)) { ++char_addr; }
@@ -839,19 +839,19 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			else if(STREQN_NOCASE(char_addr,"n/a",3))
 				char_addr = strstr(char_addr, ",") + 1;
 
-			p = strtoull(char_addr, &cptr, 10);	ASSERT(HERE, p != -1ull, "strtoull() overflow detected.");
+			p = strtoull(char_addr, &cptr, 10);	ASSERT(p != -1ull, "strtoull() overflow detected.");
 		}
 
 	GET_EXPO:
 		// Need to init this for savefile-naming code
-		ASSERT(HERE, p != 0ull, "Exponent has not been set!");
-		sprintf(ESTRING,"%llu",p);
+		ASSERT(p != 0ull, "Exponent has not been set!");
+		sprintf(ESTRING,"%" PRIu64,p);
 
 		// In PRP-test case, have already read the exponent from the worktodo line
 		/* Special case of user forcing a non-default FFT length for an exponent in the worktodo file: */
 		if(exponent && (p != exponent)) {	// || (MODULUS_TYPE != MODULUS_TYPE_MERSENNE))	15. Oct 2012: Need same flexibility for Fermat numbers (e.g. F27 @ 7168k) as for Mersennes, so disable modulus-type part of conditional
 			sprintf(cbuf,"User-supplied exponent and FFT-length for full-length test requires an exponent-matching 'Test=<exponent>' or 'DoubleCheck=<exponent>' %s entry!",WORKFILE);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		/* Check #bits in the Mersenne exponent vs. the allowed maximum: */
@@ -864,14 +864,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			if(findex <= MAX_PRIMALITY_TEST_BITS)
 				p = (uint64)1 << findex;
 			else
-				ASSERT(HERE, 0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS");
+				ASSERT(0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS");
 			// For purposes of the bits-in-p limit, treat 2^findex as having (findex) rather than (findex+1) bits:
 			nbits_in_p = findex;
 		}
 		else
-			ASSERT(HERE, 0,"MODULUS_TYPE unknown!");
+			ASSERT(0,"MODULUS_TYPE unknown!");
 
-		ASSERT(HERE, nbits_in_p <= MAX_EXPO_BITS,"Require nbits_in_p <= MAX_EXPO_BITS");
+		ASSERT(nbits_in_p <= MAX_EXPO_BITS,"Require nbits_in_p <= MAX_EXPO_BITS");
 
 	#if INCLUDE_TF
 
@@ -889,7 +889,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			/* For now, always start at k = 1: */
 			log2_min_factor = 0.0;
 			log2_max_factor = get_default_factoring_depth(p);
-			ASSERT(HERE, log2_max_factor <= MAX_FACT_BITS, "log2_max_factor > MAX_FACT_BITS!");
+			ASSERT(log2_max_factor <= MAX_FACT_BITS, "log2_max_factor > MAX_FACT_BITS!");
 
 			/* Field following the exponent is the already-factored-to depth: if none found, use defaults. */
 			char_addr = strstr(char_addr, ",");
@@ -951,7 +951,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				TF_BITS = strtoul(char_addr, &endp, 10);
 			#if INCLUDE_TF
 				if(TF_BITS > MAX_FACT_BITS) {
-					snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: TF_BITS of %u > max. allowed of %u. The ini file entry was %s\n", TF_BITS, MAX_FACT_BITS, in_line);
+					snprintf(cbuf,STR_MAX_LEN*2,"ERROR: TF_BITS of %u > max. allowed of %u. The ini file entry was %s\n", TF_BITS, MAX_FACT_BITS, in_line);
 					fprintf(stderr,"%s",cbuf);
 					goto GET_NEXT_ASSIGNMENT;
 				}
@@ -976,7 +976,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					pm1_done = strtoul(char_addr, &endp, 10);
 					if(pm1_done > 1) {
 						sprintf(cbuf, "ERROR: the specified pm1_done field [%u] should be 0 or 1!\n",pm1_done);
-						ASSERT(HERE,0,cbuf);
+						ASSERT(0,cbuf);
 					}
 					if(!pm1_done) {	// pm1_done == TRUE is a no-op, translating to "proceed with primality test"
 						// Don't actually use this in pm1_set_bounds(), due to the rise of the single-shot PRP-with-proof paradigm, but for form's sake:
@@ -984,14 +984,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 						// Create p-1 assignment, then edit original assignment line appropriately
 						TEST_TYPE = TEST_TYPE_PM1;
 						kblocks = get_default_fft_length(p);
-						ASSERT(HERE, pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!");
+						ASSERT(pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!");
 						// Format the p-1 assignment into cbuf:
-						char_addr = strstr(in_line, "=");	ASSERT(HERE,char_addr != 0x0,"Malformed assignment!");
+						char_addr = strstr(in_line, "=");	ASSERT(char_addr != 0x0,"Malformed assignment!");
 						char_addr++;	while(isspace(*char_addr)) { ++char_addr; }	// Skip any whitespace following the equals sign
 						if(is_hex_string(char_addr, 32)) {
-							strncpy(aid,char_addr,32);	sprintf(cbuf,"Pminus1=%s,1,2,%llu,-1,%u,%llu\n",aid,p,B1,B2);	// If we get here, it's a M(p), not F(m)
+							strncpy(aid,char_addr,32);	sprintf(cbuf,"Pminus1=%s,1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",aid,p,B1,B2);	// If we get here, it's a M(p), not F(m)
 						} else
-							sprintf(cbuf,"Pminus1=1,2,%llu,-1,%u,%llu\n",p,B1,B2);
+							sprintf(cbuf,"Pminus1=1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",p,B1,B2);
 
 						// Copy all but the final (pm1_done) char of the assignment into cstr and append pm1_done = 1. If in_line ends with newline, first --j:
 						j = strlen(in_line) - 1;	j -= (in_line[j] == '\n');
@@ -1012,7 +1012,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	else if(exponent != 0)	/* elseif((found WORKFILE) == FALSE) */
 	{
 		p = exponent;
-		fprintf(stderr," %s file not found...using user-supplied command-line exponent p = %llu\n",WORKFILE,p);
+		fprintf(stderr," %s file not found...using user-supplied command-line exponent p = %" PRIu64 "\n",WORKFILE,p);
 		/* This takes care of the number-to-char conversion and leading-whitespace-removal
 		in one step - use PSTRING for temporary storage here: */
 		strcpy(ESTRING, &PSTRING[convert_uint64_base10_char(PSTRING, p)]);
@@ -1023,7 +1023,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			if(findex <= MAX_PRIMALITY_TEST_BITS)
 				p = (uint64)1 << findex;
 			else
-				ASSERT(HERE, 0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS");
+				ASSERT(0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS");
 
 			/* For purposes of the bits-in-p limit, treat 2^findex as having
 			(findex) rather than (findex+1) bits: */
@@ -1033,15 +1033,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 
 		INTERACT=TRUE;
 
-		ASSERT(HERE,TEST_TYPE,"TEST_TYPE not set!");
-		ASSERT(HERE,TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!");
+		ASSERT(TEST_TYPE,"TEST_TYPE not set!");
+		ASSERT(TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!");
 
 		/* If nbits_in_p > MAX_PRIMALITY_TEST_BITS, it better be a TF run: */
 		if(TEST_TYPE == TEST_TYPE_TF)
 		{
 		#if INCLUDE_TF
 			/* Currently TF only supported for Mersennes: */
-			ASSERT(HERE, (MODULUS_TYPE == MODULUS_TYPE_MERSENNE), "Trial-factoring Currently only supported for Mersenne numbers");
+			ASSERT((MODULUS_TYPE == MODULUS_TYPE_MERSENNE), "Trial-factoring Currently only supported for Mersenne numbers");
 			/* For now, always start at k = 1: */
 			log2_min_factor = 0.0;
 			if(iterations) {
@@ -1051,15 +1051,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			else
 				log2_max_factor = get_default_factoring_depth(p);
 
-			ASSERT(HERE, log2_max_factor >=             0, "log2_max_factor must be positive!");
-			ASSERT(HERE, log2_max_factor <= MAX_FACT_BITS, "log2_max_factor exceeds MAX_FACT_BITS!");
+			ASSERT(log2_max_factor >=             0, "log2_max_factor must be positive!");
+			ASSERT(log2_max_factor <= MAX_FACT_BITS, "log2_max_factor exceeds MAX_FACT_BITS!");
 		#else
-			ASSERT(HERE, 0, "Trial-factoring not supported for this build/platform.");
+			ASSERT(0, "Trial-factoring not supported for this build/platform.");
 		#endif
 		}
 		else if(TEST_TYPE == TEST_TYPE_PM1)	/* P-1 factoring attempt */
 		{
-			ASSERT(HERE, nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring.");
+			ASSERT(nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring.");
 			pm1_check_bounds();
 			// Proper setting of timing_test_iters in this case needs us to compute the stage 1 prime-powers product:
 			// Compute stage 1 prime-powers product, store in PM1_S1_PRODUCT and store #bits of same in PM1_S1_PROD_BITS:
@@ -1075,8 +1075,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		else	/* Primality or PRP test */
 		{
 		/*	fprintf(stderr, "P = %u, nbits_in_p = %d\n",p,nbits_in_p);	*/
-			ASSERT(HERE, nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring.");
-			ASSERT(HERE,iterations != 0,"Timing test with User-supplied exponent requires number of iterations to be specified via the -iters flag!");
+			ASSERT(nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring.");
+			ASSERT(iterations != 0,"Timing test with User-supplied exponent requires number of iterations to be specified via the -iters flag!");
 			if(iterations <= 0) {
 				fprintf(stderr, " Specified %u self-test iterations : must be > 0.\n", iterations);
 				return ERR_TESTITERS_OUTOFRANGE;
@@ -1089,12 +1089,12 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	} else {
 		fprintf(stderr,"No %s file not found, nor user-supplied command-line exponent.\n",WORKFILE);
 		print_help();
-		ASSERT(HERE, 0, "Unsupported combination of command-line args. Note that if you are trying to\nrun a single-FFT-length self-test, you *must* explicitly specify the iteration\ncount, e.g. './Mlucas -fft 7168 <-iters [+int]> [-cpu <args>]'");
+		ASSERT(0, "Unsupported combination of command-line args. Note that if you are trying to\nrun a single-FFT-length self-test, you *must* explicitly specify the iteration\ncount, e.g. './Mlucas -fft 7168 <-iters [+int]> [-cpu <args>]'");
 	}	// endif(found WORKFILE?)
 
 	// If production run (not self-test), echo assignment to per-exponent logfile:
 	if(!INTERACT) {
-		snprintf_nowarn(cbuf,STR_MAX_LEN," %s entry: %s\n",WORKFILE,in_line);
+		snprintf(cbuf,STR_MAX_LEN*2," %s entry: %s\n",WORKFILE,in_line);
 		mlucas_fprint(cbuf,0);
 	}
 
@@ -1112,8 +1112,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	strcpy(STATFILE, RESTARTFILE);
 	strcat(STATFILE, ".stat");
 	/*fprintf(stderr, "STATFILE = %s\n",STATFILE);	*/
-	ASSERT(HERE,TEST_TYPE,"TEST_TYPE not set!");
-	ASSERT(HERE,TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!");
+	ASSERT(TEST_TYPE,"TEST_TYPE not set!");
+	ASSERT(TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!");
 
 	/* Fom this point onward the first character of restart filenames is context-dependent: */
 #if INCLUDE_TF
@@ -1124,7 +1124,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		if(!ASSIGNMENT_TYPE_MATRIX[MODULUS_TYPE][TEST_TYPE_TF])
 		{
 			sprintf(cbuf, "TEST_TYPE_TF with MODULUS_TYPE = %u not supported!\n", MODULUS_TYPE);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		}
 
 		factor(ESTRING, log2_min_factor, log2_max_factor);
@@ -1133,17 +1133,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 #endif
 	if(TEST_TYPE > TEST_TYPE_MAX)
 	{
-		ASSERT(HERE, 0,"ERROR: Unrecognized assignment type in savefile processing.\n");
+		ASSERT(0,"ERROR: Unrecognized assignment type in savefile processing.\n");
 	}
 	/* endif(TEST_TYPE == ...) */
 
 /********************* P-1, primality, or PRP Test: ***********************************************/
 
 	if(p < PMIN) {
-		fprintf(stderr, " p must be at least %llu.\n",PMIN);
+		fprintf(stderr, " p must be at least %" PRIu64 ".\n",PMIN);
 		return ERR_EXPONENT_ILLEGAL;
 	} else if(p > PMAX) {
-		fprintf(stderr, " p must be no greater than %llu.\n",PMAX);
+		fprintf(stderr, " p must be no greater than %" PRIu64 ".\n",PMAX);
 		return ERR_EXPONENT_ILLEGAL;
 	}
 
@@ -1173,7 +1173,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		}
 
 		TRANSFORM_TYPE = REAL_WRAPPER;
-		snprintf_nowarn(PSTRING,STR_MAX_LEN, "M%s", ESTRING);
+		snprintf(PSTRING,STR_MAX_LEN, "M%s", ESTRING);
 		/* v19:
 		Unlike standard mod-M(p) Fermat-PRP test, x0^(N-1) ?== 1 (mod N) which for N = M(p) gives N-1 = 2^p-2
 		= 0b111[p-1 binary 1s]1110 and thus requires [p-2 (x := x^2*base) steps followed by 1 final squaring], the
@@ -1192,22 +1192,22 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			pm1_check_bounds();
 			s1p_alloc = compute_pm1_s1_product(p);
 			maxiter = PM1_S1_PROD_BITS;	// NOTE: In this case we don't want to override the PRP_BASE = 3 value set in compute_pm1_s1_product()
-			ASSERT(HERE, B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!");
+			ASSERT(B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!");
 			RES_SHIFT = 0ull;	// Must set = 0 here to make sure BASE_MULTIPLIER_BITS array gets set = 0 below
 		} else
-			ASSERT(HERE,0,"Unsupported test type! (Neither LL,PRP nor P-1)");
+			ASSERT(0,"Unsupported test type! (Neither LL,PRP nor P-1)");
 	}
 	else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
 	#ifdef USE_ARM_V8_SIMD
-		ASSERT(HERE, 0, "ARMv8 SIMD builds do not support Fermat-number testing!");
+		ASSERT(0, "ARMv8 SIMD builds do not support Fermat-number testing!");
 	#endif
-		ASSERT(HERE,findex >= 13 && findex < 64, "Fermat number index must be in range [13,63]!\n");
+		ASSERT(findex >= 13 && findex < 64, "Fermat number index must be in range [13,63]!\n");
 		// This takes care of the number-to-char conversion and leading-whitespace-removal
 		// in one step - use PSTRING for temporary storage here:
 		strcpy(ESTRING, &PSTRING[convert_uint64_base10_char(PSTRING, (uint64)findex)]);
-		ASSERT(HERE, (p >> findex) == 1,"Require (p >> findex) == 1");
-		sprintf(BIN_EXP,"%llu",p);	// May need this for workfile postprocessing if assignment is in KBNC format
+		ASSERT((p >> findex) == 1,"Require (p >> findex) == 1");
+		sprintf(BIN_EXP,"%" PRIu64,p);	// May need this for workfile postprocessing if assignment is in KBNC format
 		TRANSFORM_TYPE = RIGHT_ANGLE;
 		sprintf(PSTRING, "F%u", findex);
 		if(TEST_TYPE == TEST_TYPE_PRIMALITY) {
@@ -1215,17 +1215,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			PRP_BASE = 2;	// v20: Pépin test doesn't use this as the initial seed (that defaults to 3), but rather for the random-shift
 							// offsets used to prevent the shift count from modding to 0 as a result of repeated doublings (mod 2^m)
 		} else if(TEST_TYPE == TEST_TYPE_PRP) {
-			ASSERT(HERE, KNOWN_FACTORS[0] != 0, "Fermat-mod PRP test implies a PRP-CF run, but no known-factors provided!");
+			ASSERT(KNOWN_FACTORS[0] != 0, "Fermat-mod PRP test implies a PRP-CF run, but no known-factors provided!");
 			RES_SHIFT = 0ull;	// Must set = 0 here to make sure BASE_MULTIPLIER_BITS array gets set = 0 below
 		} else if(TEST_TYPE == TEST_TYPE_PM1) {
 			// Compute stage 1 prime-powers product, store in PM1_S1_PRODUCT, store #bits of same in PM1_S1_PROD_BITS:
 			pm1_check_bounds();
 			s1p_alloc = compute_pm1_s1_product(p);
 			maxiter = PM1_S1_PROD_BITS;	// NOTE: In this case we don't want to override the PRP_BASE = 3 value set in compute_pm1_s1_product()
-			ASSERT(HERE, B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!");
+			ASSERT(B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!");
 			RES_SHIFT = 0ull;	// Must set = 0 here to make sure BASE_MULTIPLIER_BITS array gets set = 0 below
 		} else
-			ASSERT(HERE,0,"Unsupported test type! (Neither Pepin-primality nor P-1)");
+			ASSERT(0,"Unsupported test type! (Neither Pepin-primality nor P-1)");
 
 		j = ((ITERS_BETWEEN_CHECKPOINTS+63) >> 6);
 		if(RES_SHIFT == 0ull) {
@@ -1244,14 +1244,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		}
 	}
 	else {
-		ASSERT(HERE, 0,"Unknown Self-Test Modulus Type!");
+		ASSERT(0,"Unknown Self-Test Modulus Type!");
 	}	/* endif(MODULUS_TYPE) */
 
 	// mi64_shlc currently limited to 32-bit shift counts - for technical reasons described in comments at top of that function,
 	// the largest exponent testable-with-shift must satisfy condition below, which yields largest M(p) with p = 4294967231 = 2^32-65:
 	if(RES_SHIFT && (p+63) > 0xFFFFFFFFull) {
 		sprintf(cbuf,"ERROR: Exponents this large do not support residue shift! Please run with '-shift 0'.\n");
-		ASSERT(HERE,0,cbuf);
+		ASSERT(0,cbuf);
 	}
 	/* In production-run (INTERACT = False) mode, allow command-line-forced FFT lengths which are at most
 	"one size too large" relative to the default length for the exponent in question. Supported lengths
@@ -1261,7 +1261,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	kblocks = get_default_fft_length(p);
 	if(!fft_length || (!INTERACT && MODULUS_TYPE == MODULUS_TYPE_MERSENNE && 8*fft_length > 9*kblocks)) {
 		if(!kblocks) {
-			fprintf(stderr,"ERROR detected in get_default_fft_length for p = %llu.\n",p);
+			fprintf(stderr,"ERROR detected in get_default_fft_length for p = %" PRIu64 ".\n",p);
 			return ERR_FFTLENGTH_ILLEGAL;
 		}
 	} else {
@@ -1319,8 +1319,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			/* Only allow lengths that are <= 2x default */
 			if( !(i >= kblocks && i <= (kblocks<<1) ) )
 			{
-				sprintf(cbuf,"Call to get_preferred_fft_radix returns out-of-range FFT length: asked for %u, returned %u, packed value= 0x%8X\n", kblocks, i, dum);
-				ASSERT(HERE, 0, cbuf);
+				sprintf(cbuf,"Call to get_preferred_fft_radix returns out-of-range FFT length: asked for %u, returned %u, packed value= %#8X\n", kblocks, i, dum);
+				ASSERT(0, cbuf);
 			}
 			else	/* If length acceptable, extract the FFT-radix data encoded and populate the NRADICES and RADIX_VEC[] globals */
 			{
@@ -1329,9 +1329,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				/* Make sure the FFT length is supported: */
 				if(get_fft_radices(kblocks, 0, 0x0, 0x0, 0) != 0)
 				{
-					ASSERT(HERE, get_fft_radices(kblocks, 0, 0x0, 0x0, 0) == ERR_FFTLENGTH_ILLEGAL, "Unexpected return value for get_fft_radices()");
+					ASSERT(get_fft_radices(kblocks, 0, 0x0, 0x0, 0) == ERR_FFTLENGTH_ILLEGAL, "Unexpected return value for get_fft_radices()");
 					sprintf(cbuf, "ERROR: length %d = %d K not available.\n",n,kblocks);
-					fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 			}
 		}
@@ -1348,7 +1348,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	print a warning if the p/pmax ratio > 1 to an acceptably small degree; error out if the ratio is unreasonably > 1:
 	*/
 	uint64 pmax_rec = given_N_get_maxP(n);	double exp_ratio =  (double)p/pmax_rec;
-	fprintf(stderr, "INFO: Maximum recommended exponent for FFT length (%u Kdbl) = %llu; p[ = %llu]/pmax_rec = %12.10f.\n",kblocks,pmax_rec,p,exp_ratio);
+	fprintf(stderr, "INFO: Maximum recommended exponent for FFT length (%u Kdbl) = %" PRIu64 "; p[ = %" PRIu64 "]/pmax_rec = %12.10f.\n",kblocks,pmax_rec,p,exp_ratio);
 	// Set initial value of USE_SHORT_CY_CHAIN based on how close p/pmax is to 1.0, but only if current chain length is longer
 	// (e.g. if ROE-retry logic has led to a shorter-than-default chain length, don't revert to default):
 	if(exp_ratio > 0.99 && USE_SHORT_CY_CHAIN < 3)
@@ -1369,7 +1369,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			fprintf(stderr, "INFO: specified FFT length %d K is less than recommended %d K for this p.\n",kblocks,i);
 		else {
 			sprintf(cbuf, "ERROR: specified FFT length %d K is much too small: Recommended length for this p = %d K ... quitting.\n",kblocks,i);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		}
 	}
 
@@ -1383,14 +1383,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	}
 	/*...If array padding turned on, check that the blocklength divides the unpadded runlength...	*/
 	if((DAT_BITS < 31) && ((n >> DAT_BITS) << DAT_BITS) != n)
-		ASSERT(HERE, 0,"ERROR: blocklength does not divide runlength!");
+		ASSERT(0,"ERROR: blocklength does not divide runlength!");
 
 	/*...Find padded array length...	*/
 	npad = n + ( (n >> DAT_BITS) << PAD_BITS );	/* length of padded data array.	*/
 	/* If the residue and other modulus-size-dependent data arrays too small for the new assignment, deallocate them: */
 	if(nalloc > 0 && npad > nalloc)
 	{
-		ASSERT(HERE, a_ptmp != 0x0 && a != 0x0 && b != 0x0 && c != 0x0 && d != 0x0,"Require (a_ptmp,a,b,c,d) != 0x0");
+		ASSERT(a_ptmp != 0x0 && a != 0x0 && b != 0x0 && c != 0x0 && d != 0x0,"Require (a_ptmp,a,b,c,d) != 0x0");
 		free((void *)a_ptmp); a_ptmp = a = b = c = d = e = 0x0; b_uint64_ptr = c_uint64_ptr = d_uint64_ptr = e_uint64_ptr = 0x0;
 		free((void *)arrtmp); arrtmp=0x0;
 		free((void *)BIGWORD_BITMAP);	BIGWORD_BITMAP = 0x0;
@@ -1406,9 +1406,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		j = 0;
 		if(npad & 7)
 			j = 8 - (npad & 7);
-		nalloc = npad + j;	ASSERT(HERE, (nalloc & 7) == 0,"nalloc must be a multiple of 8!");	// This is so b,c,d enjoy same 64-byte alignment as a[]
+		nalloc = npad + j;	ASSERT((nalloc & 7) == 0,"nalloc must be a multiple of 8!");	// This is so b,c,d enjoy same 64-byte alignment as a[]
 		nbytes = nalloc<<3;
-		ASSERT(HERE, a_ptmp == 0x0 && a == 0x0 && b == 0x0 && c == 0x0 && d == 0x0 && e == 0x0 && arrtmp == 0x0,"Require (a_ptmp,b,c,d,e,arrtmp) == 0x0");
+		ASSERT(a_ptmp == 0x0 && a == 0x0 && b == 0x0 && c == 0x0 && d == 0x0 && e == 0x0 && arrtmp == 0x0,"Require (a_ptmp,b,c,d,e,arrtmp) == 0x0");
 		if(use_lowmem == 2) {	// Handy for huge-FFT self-tests on low-mem systems
 			sprintf(cbuf,"WARN: Low-memory[%u] run mode disallows PRP-testing|Gerbicz-check and p-1 stage 2.\n",use_lowmem);
 			mlucas_fprint(cbuf,1);
@@ -1416,11 +1416,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		} else {
 			j = 5;
 		}
-		a_ptmp = ALLOC_DOUBLE(a_ptmp, j*nalloc);	if(!a_ptmp){ sprintf(cbuf, "ERROR: unable to allocate array A in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		a_ptmp = ALLOC_DOUBLE(a_ptmp, j*nalloc);	if(!a_ptmp){ sprintf(cbuf, "ERROR: unable to allocate array A in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		a      = ALIGN_DOUBLE(a_ptmp);
-		ASSERT(HERE, ((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
+		ASSERT(((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
 		if(((intptr_t)a & 127) != 0x0)
-			fprintf(stderr, "WARN: a[] = 0x%08lX not aligned on 128-byte boundary!\n", (intptr_t)a);
+			fprintf(stderr, "WARN: a[] = %#08" PRIXPTR " not aligned on 128-byte boundary!\n", (intptr_t)a);
 		// v19: Add three more full-residue arrays to support 2-input FFT-modmul needed for Gerbicz check (and later, p-1 support):
 		if(use_lowmem < 2) {
 			b = a + nalloc;	c = b + nalloc;	d = c + nalloc, e = d + nalloc;
@@ -1431,14 +1431,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		// For multi-FFT-length self-tests, conservatively figure as many as 20 bits (2.5 bytes) per float-double residue word:
 		// v20: for largest currently supported FFT of 512Mdoubles, i still -barely - fits in a uint32, but 2.5*i does not:
 		arrtmp_alloc = i; arrtmp_alloc = MAX((p+63)>>2, (uint64)(arrtmp_alloc*2.5)) >> 3;	// #limb needed to store p bits = (p+63)>>6, so alloc at least 2x this
-		arrtmp = ALLOC_UINT64(arrtmp, arrtmp_alloc);if(!arrtmp ){ sprintf(cbuf, "ERROR: unable to allocate array ARRTMP with %u bytes in main.\n",i); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		arrtmp = ALLOC_UINT64(arrtmp, arrtmp_alloc);if(!arrtmp ){ sprintf(cbuf, "ERROR: unable to allocate array ARRTMP with %u bytes in main.\n",i); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 		// For an n-word main-array, BIGWORD_BITMAP and BIGWORD_NBITS have (n/64) elts each, thus need 1/64 + 1/32 the total
 		// storage of the main-array. Use uint64 alloc-macro for both, so halve the num-elts arg for the BIGWORD_NBITS alloc.
 		// As with above arrays, for multi-length self-test, alloc based on max. FFT length used (i) rather than current length (n).
 		// Don't need any array padding on these bitmap arrays, but since nalloc includes padding, no harm in using it:
-		BIGWORD_BITMAP =           ALLOC_UINT64(BIGWORD_BITMAP, nalloc>>6);	if(!BIGWORD_BITMAP){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_BITMAP in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		BIGWORD_NBITS  = (uint32 *)ALLOC_UINT64(BIGWORD_NBITS , nalloc>>7);	if(!BIGWORD_NBITS ){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_NBITS in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		BIGWORD_BITMAP =           ALLOC_UINT64(BIGWORD_BITMAP, nalloc>>6);	if(!BIGWORD_BITMAP){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_BITMAP in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		BIGWORD_NBITS  = (uint32 *)ALLOC_UINT64(BIGWORD_NBITS , nalloc>>7);	if(!BIGWORD_NBITS ){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_NBITS in main.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 	}
 // Multithreaded-code debug: Set address to watch:
 #ifdef MULTITHREAD
@@ -1456,8 +1456,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		i = ITERS_BETWEEN_GCHECKS;
 		j = ITERS_BETWEEN_GCHECK_UPDATES;
 		k = ITERS_BETWEEN_CHECKPOINTS;
-		ASSERT(HERE, i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)");
-		ASSERT(HERE, i%k == 0 && k%j == 0, "G-checkproduct update interval must divide savefile-update one, which must divide the G-check interval");
+		ASSERT(i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)");
+		ASSERT(i%k == 0 && k%j == 0, "G-checkproduct update interval must divide savefile-update one, which must divide the G-check interval");
 	}
 
 	// PRP-test: Init bitwise multiply-by-base array - cf. comment re. modified Fermat-PRP needed by Gerbicz check
@@ -1483,7 +1483,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		if(fp) {
 			if(TEST_TYPE == TEST_TYPE_PRP) {
 				dum = PRP_BASE;
-				ASSERT(HERE, use_lowmem < 2, "PRP-test mode not available in Low-memory[2] run mode!");
+				ASSERT(use_lowmem < 2, "PRP-test mode not available in Low-memory[2] run mode!");
 			}
 			i = read_ppm1_savefiles(cstr, p, &j, fp, &itmp64,
 												(uint8*)arrtmp      , &Res64,&Res35m1,&Res36m1,	// Primality-test residue
@@ -1495,17 +1495,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				if(strstr(cbuf, "read_ppm1_savefiles"))
 					mlucas_fprint(cbuf,1);
 				/* And now for the official spokesmessage: */
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: read_ppm1_savefiles Failed on savefile %s!\n",cstr);
+				snprintf(cbuf,STR_MAX_LEN*2, "ERROR: read_ppm1_savefiles Failed on savefile %s!\n",cstr);
 				mlucas_fprint(cbuf,1);
 
 				if(ierr == ERR_GERBICZ_CHECK) {
 					sprintf(cbuf,"Failed to correctly read last-good-Gerbicz-check data savefile!");
-					mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+					mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 				} else if(cstr[0] != 'q') {
 					cstr[0] = 'q';	goto READ_RESTART_FILE;
 				} else {
 					sprintf(cbuf,"Failed to correctly read both primary or secondary savefile!");
-					mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+					mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 				}
 			}
 			// If user attempts to restart run with different PRP base than it was started with, ignore the new value and continue with the initial one:
@@ -1546,33 +1546,33 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			*/
 			if(ierr == ERR_GERBICZ_CHECK) {
 				MOD_ADD64(RES_SHIFT,RES_SHIFT,p,RES_SHIFT);
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "Gerbicz-check-error restart: Mod-doubling residue shift to avoid repeating any possible fractional-error aliasing in retry, new shift = %llu\n",RES_SHIFT);
+				snprintf(cbuf,STR_MAX_LEN*2, "Gerbicz-check-error restart: Mod-doubling residue shift to avoid repeating any possible fractional-error aliasing in retry, new shift = %" PRIu64 "\n",RES_SHIFT);
 				mlucas_fprint(cbuf,1);
 			}
 			/* Allocate floating-point residue array and convert savefile bytewise residue to floating-point form, after
 			first applying required circular shift read into the global RES_SHIFT during the above bytewise-savefile read.
 			*/
 			if(!convert_res_bytewise_FP((uint8*)arrtmp, a, n, p)) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",cstr);
+				snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",cstr);
 				mlucas_fprint(cbuf,0);
 				if(cstr[0] != 'q' && !(ierr == ERR_GERBICZ_CHECK)) {	// Secondary savefile only exists for regular checkpoint files
 					cstr[0] = 'q';
 					goto READ_RESTART_FILE;
 				} else {
-					ASSERT(HERE,0,cbuf);
+					ASSERT(0,cbuf);
 				}
 			}
 			// v19: G-check residue - we only create savefile for PRP-phase of any PRP-CF run, i.e. always expect a G-check residue:
 		  if(DO_GCHECK) {
 			if(!convert_res_bytewise_FP((uint8*)e_uint64_ptr, b, n, p)) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on Gerbicz-check residue read from savefile %s!\n",cstr);
-				mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+				snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on Gerbicz-check residue read from savefile %s!\n",cstr);
+				mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 			} else {
 				ierr = 0;
 				s1 = sum64(b_uint64_ptr, n); s2 = s3 = s1;	// Init triply-redundant checksum of G-checkproduct
 			}
 		  }
-			ASSERT(HERE, ilo > 0,"Require ilo > 0!");
+			ASSERT(ilo > 0,"Require ilo > 0!");
 			ihi = ilo+ITERS_BETWEEN_CHECKPOINTS;
 			/* If for some reason last checkpoint was at a non-multiple of ITERS_BETWEEN_CHECKPOINTS, round down: */
 			ihi-= ihi%ITERS_BETWEEN_CHECKPOINTS;
@@ -1584,11 +1584,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		{
 			/* If we're on the primary restart file, set up for secondary: */
 			if(ierr == ERR_GERBICZ_CHECK || s2_continuation) {	// Secondary savefile only exists for regular checkpoint files
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "INFO: Needed restart file %s not found...moving on to next assignment in %s.\n",cstr,WORKFILE);
+				snprintf(cbuf,STR_MAX_LEN*2, "INFO: Needed restart file %s not found...moving on to next assignment in %s.\n",cstr,WORKFILE);
 				mlucas_fprint(cbuf,1);
 				goto GET_NEXT_ASSIGNMENT;
 			} else if(cstr[0] != 'q') {
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "INFO: primary restart file %s not found...looking for secondary...\n",cstr);
+				snprintf(cbuf,STR_MAX_LEN*2, "INFO: primary restart file %s not found...looking for secondary...\n",cstr);
 				mlucas_fprint(cbuf,1);
 				cstr[0] = 'q';
 				goto READ_RESTART_FILE;
@@ -1611,11 +1611,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			ihi = ITERS_BETWEEN_CHECKPOINTS;
 	}
 
-	ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!");
-	ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
+	ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!");
+	ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
 
 	/* If at the start of a p-1 or primality test, set the initial seed for the run: */
-	ASSERT(HERE, TEST_TYPE <= TEST_TYPE_MAX,"Given TEST_TYPE not supported!");
+	ASSERT(TEST_TYPE <= TEST_TYPE_MAX,"Given TEST_TYPE not supported!");
 	if(ilo == 0)
 	{
 		memset(a, 0, npad*sizeof(double));
@@ -1638,7 +1638,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		// In theory could allow residue-shift during P-1, at least in stage 1, but in practice need the BASE_MULTIPLIER_BITS array
 		// to hold the part of the stage 1 prime-powers product needed for the current iteration interval of the stage 1 powering:
 		if(TEST_TYPE == TEST_TYPE_PM1) {
-			ASSERT(HERE, RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n");
+			ASSERT(RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n");
 			RES_SHIFT = 0ull; a[0] = iseed;
 		} else {
 			// Apply initial-residue shift - if user has not set one via cmd-line or current value >= p, randomly choose a value in [0,p).
@@ -1658,24 +1658,24 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			}
 			// Since residue is otherwise 0, use shifted-carryin function on double-precision padded-array residue:
 			itmp64 = shift_word(a, n, p, RES_SHIFT, (double)iseed);	// Note return value (specifically high 7 bytes thereof) is an unpadded index
-			ASSERT(HERE, (itmp64 >>  8) < n                , "Return value of shift_word(): unpadded-array-index out of range!");
-			ASSERT(HERE, (itmp64 & 255) < ceil((double)p/n), "Return value of shift_word(): bit-in-array-word value out of range!");
+			ASSERT((itmp64 >>  8) < n                , "Return value of shift_word(): unpadded-array-index out of range!");
+			ASSERT((itmp64 & 255) < ceil((double)p/n), "Return value of shift_word(): bit-in-array-word value out of range!");
 		}
 	} else if(DO_GCHECK) {
 		if(MODULUS_TYPE == MODULUS_TYPE_FERMAT && TEST_TYPE == TEST_TYPE_PRIMALITY && !INTERACT) {	// Allow shift in timing-test mode
-			ASSERT(HERE, RES_SHIFT == 0ull, "Shifted residues unsupported for Pépin test with Gerbicz check!\n");
+			ASSERT(RES_SHIFT == 0ull, "Shifted residues unsupported for Pépin test with Gerbicz check!\n");
 		}
 		memcpy(d, b, nbytes);	// If doing a PRP test, init redundant copy d[] Gerbicz residue-product accumulator b[].
 	}
 
 	if(restart) {
-		if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf_nowarn(cbuf,STR_MAX_LEN, "Restarting %s at iteration = %u, residue shift count = %llu.\nRes64,Res35m1,Res36m1: %016llX,%llu,%llu\n",PSTRING,ilo,RES_SHIFT,Res64,Res35m1,Res36m1);
-		else snprintf_nowarn(cbuf,STR_MAX_LEN, "Restarting %s at iteration = %u. Res64: %016llX, residue shift count = %llu\n",PSTRING,ilo,Res64,RES_SHIFT);
+		if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf(cbuf,STR_MAX_LEN*2, "Restarting %s at iteration = %u, residue shift count = %" PRIu64 ".\nRes64,Res35m1,Res36m1: %016" PRIX64 ",%" PRIu64 ",%" PRIu64 "\n",PSTRING,ilo,RES_SHIFT,Res64,Res35m1,Res36m1);
+		else snprintf(cbuf,STR_MAX_LEN*2, "Restarting %s at iteration = %u. Res64: %016" PRIX64 ", residue shift count = %" PRIu64 "\n",PSTRING,ilo,Res64,RES_SHIFT);
 		mlucas_fprint(cbuf,0);
 	}
 
 	/*...Restart and FFT info.	*/
-	snprintf_nowarn(cbuf,STR_MAX_LEN,"%s: using FFT length %uK = %u 8-byte floats, initial residue shift count = %llu\n",PSTRING,kblocks,n,RES_SHIFT);
+	snprintf(cbuf,STR_MAX_LEN*2,"%s: using FFT length %uK = %u 8-byte floats, initial residue shift count = %" PRIu64 "\n",PSTRING,kblocks,n,RES_SHIFT);
 	sprintf(cstr,"This gives an average %20.15f bits per digit\n",1.0*p/n);	strcat(cbuf,cstr);
 	if(TEST_TYPE == TEST_TYPE_PRP) {
 		sprintf(cstr,"The test will be done in form of a %u-PRP test.\n",PRP_BASE);	strcat(cbuf,cstr);
@@ -1774,13 +1774,13 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	int update_shift = (RES_SHIFT != 0ull);	// If shift = 0 at outset, don't update (only need for Fermat-mod, due to the random-bit aspect there)
 
 	if(TEST_TYPE == TEST_TYPE_PM1 && ilo >= maxiter) {
-		ASSERT(HERE, ilo == maxiter && ilo == PM1_S1_PROD_BITS,"For completed S1 expect ilo == maxiter == PM1_S1_PROD_BITS!");
-		snprintf_nowarn(cbuf,STR_MAX_LEN, "%s: p-1 stage 1 to b1 = %u already done -- proceeding to stage 2.\n",PSTRING,B1);
+		ASSERT(ilo == maxiter && ilo == PM1_S1_PROD_BITS,"For completed S1 expect ilo == maxiter == PM1_S1_PROD_BITS!");
+		snprintf(cbuf,STR_MAX_LEN*2, "%s: p-1 stage 1 to b1 = %u already done -- proceeding to stage 2.\n",PSTRING,B1);
 		fprintf(stderr,"%s",cbuf);
 		ilo = ihi;		// Need this to differentiate between just-completed S1 and S1 residue read from restart file,
 		goto PM1_STAGE2;// in terms of whether we need to do a GCD before proceeding to S2
 	} else if(KNOWN_FACTORS[0] != 0ull) {	// PRP-CF - but if ilo < (p-1) it's in the PRP-phase, handle like regular PRP run until that completes
-		ASSERT(HERE, TEST_TYPE == TEST_TYPE_PRP,"One or more known-factors in workfile entry requires a PRP= assignment type!");
+		ASSERT(TEST_TYPE == TEST_TYPE_PRP,"One or more known-factors in workfile entry requires a PRP= assignment type!");
 		if( ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) && (ilo >= p))
 		 || ((MODULUS_TYPE == MODULUS_TYPE_FERMAT) && (ilo >= p-1)) )
 			goto PM1_STAGE2;	// The CF-handling is a clause of the if/else beginning at this label
@@ -1788,7 +1788,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 
 	for(;;)
 	{
-		ASSERT(HERE, maxiter > 0,"Require (uint32)maxiter > 0");
+		ASSERT(maxiter > 0,"Require (uint32)maxiter > 0");
 		if(ihi > maxiter)
 			ihi = maxiter;
 		// If p-1: start of each iteration cycle, copy bits ilo:ihi-1 of PM1_S1_PRODUCT into low bits of BASE_MULTIPLIER_BITS vector:
@@ -1806,8 +1806,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			itmp64 = ~(-1ull << j); BASE_MULTIPLIER_BITS[i-1] &= itmp64;// ...and zero any excess bits at the high end.
 			for(i = 0, itmp64 = 0ull; i < s1p_alloc; i++) { itmp64 += PM1_S1_PRODUCT[i]; }
 			if(itmp64 != PM1_S1_PROD_RES64) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN,"PM1_S1_PRODUCT (mod 2^64_ checksum mismatch! (Current[%llu] != Reference[%llu]). Aborting due to suspected data corruption.\n",itmp64,PM1_S1_PROD_RES64);
-				mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+				snprintf(cbuf,STR_MAX_LEN*2,"PM1_S1_PRODUCT (mod 2^64_ checksum mismatch! (Current[%" PRIu64 "] != Reference[%" PRIu64 "]). Aborting due to suspected data corruption.\n",itmp64,PM1_S1_PROD_RES64);
+				mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 			}
 		}
 		/* Here's the big one - (ITERS_BETWEEN_CHECKPOINTS) squaring steps.
@@ -1827,7 +1827,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			i = ilo;	tdiff = 0.0;	// Need 2 timers here - tdif2 for the individual func_mod_square calls, accumulate in tdiff
 			while(!ierr && MLUCAS_KEEP_RUNNING && i < ihi) {
 				// See G-check code for why this logfile-print of initial-G-check-update residue shift value is needed in Fermat-mod case:
-				if(i == ITERS_BETWEEN_GCHECK_UPDATES) { sprintf(cbuf,"At iter ITERS_BETWEEN_GCHECK_UPDATES = %u: RES_SHIFT = %llu\n",i,RES_SHIFT); mlucas_fprint(cbuf,1); }
+				if(i == ITERS_BETWEEN_GCHECK_UPDATES) { sprintf(cbuf,"At iter ITERS_BETWEEN_GCHECK_UPDATES = %u: RES_SHIFT = %" PRIu64 "\n",i,RES_SHIFT); mlucas_fprint(cbuf,1); }
 				/* If restart-after-interrupt and thus ilo neither a non-multiple of ITERS_BETWEEN_CHECKPOINTS nor of
 				ITERS_BETWEEN_GCHECK_UPDATES, round first i-update > ilo to nearest multiple of ITERS_BETWEEN_GCHECK_UPDATES:
 				*/
@@ -1877,8 +1877,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 						fprintf(stderr,"Caught interrupt in fFFT(c) step.\n");
 						break;
 					} else {
-						snprintf_nowarn(cbuf,STR_MAX_LEN,"Unhandled Error of type[%u] = %s in fFFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr));
-						mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+						snprintf(cbuf,STR_MAX_LEN*2,"Unhandled Error of type[%u] = %s in fFFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr));
+						mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 					//	goto GET_NEXT_ASSIGNMENT;
 					}
 				}
@@ -1891,7 +1891,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					} else if(s1 == sum64(d_uint64_ptr, n)) {	// c-data good, copy back into b
 						memcpy(d, b, nbytes);
 					} else	// Catastrophic data corruption
-						ASSERT(HERE, 0, "Catastrophic data corruption detected in G-checkproduct integrity validation ... rolling back to last good G-check. ");
+						ASSERT(0, "Catastrophic data corruption detected in G-checkproduct integrity validation ... rolling back to last good G-check. ");
 				}
 
 			// First subinterval: [b] needs fwd-weighting and initial-fwd-FFT-pass done on entry, !undone on exit: mode_flag = 10_2
@@ -1907,8 +1907,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 						fprintf(stderr,"Caught interrupt in FFT(b)*FFT(c) step.\n");
 						break;
 					} else {
-						snprintf_nowarn(cbuf,STR_MAX_LEN,"Unhandled Error of type[%u] = %s in FFT(b)*FFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr));
-						mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+						snprintf(cbuf,STR_MAX_LEN*2,"Unhandled Error of type[%u] = %s in FFT(b)*FFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr));
+						mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 					//	goto GET_NEXT_ASSIGNMENT;
 					}
 				}
@@ -1991,7 +1991,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			}
 			// v20: Simplify the logic here - skip previous interval-retry step:
 			if((ierr == ERR_ROUNDOFF) && !INTERACT) {
-				ASSERT(HERE, ROE_ITER > 0, "ERR_ROUNDOFF returned but ROE_ITER <= 0!");
+				ASSERT(ROE_ITER > 0, "ERR_ROUNDOFF returned but ROE_ITER <= 0!");
 				n = get_nextlarger_fft_length(n);	kblocks = (n >> 10);
 				sprintf(cbuf," Switching to next-larger available FFT length %uK and restarting from last checkpoint file.\n",kblocks);
 				mlucas_fprint(cbuf,1);
@@ -2049,7 +2049,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S",local_time);
 			const char*iter_or_stage[] = {"Iter#","S1 bit"};	// Tag indicates Primality/PRP-test or p-1 S1 iteration
 			/*...print [date in hh:mm:ss | p | iter-count-or-stage progress | %-complete | time | per-iter time | Res64 | max ROE | residue-shift] */
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "[%s] %s %s = %u [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016llX. AvgMaxErr = %10.9f. MaxErr = %10.9f. Residue shift count = %llu.\n"
+			snprintf(cbuf,STR_MAX_LEN*2, "[%s] %s %s = %u [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016" PRIX64 ". AvgMaxErr = %10.9f. MaxErr = %10.9f. Residue shift count = %" PRIu64 ".\n"
 				, timebuffer, PSTRING, iter_or_stage[TEST_TYPE == TEST_TYPE_PM1], ihi, (float)ihi / (float)maxiter * 100,get_time_str(tdiff)
 				, 1000*get_time(tdiff)/(ihi - ilo), Res64, AME, MME, RES_SHIFT);
 			mlucas_fprint(cbuf,scrnFlag);
@@ -2089,8 +2089,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					fprintf(stderr,"Caught interrupt in Gerbicz-checkproduct mod-squaring update ... skipping G-check and savefile-update and performing immediate-exit.\n");
 					exit(1);
 				} else {
-					snprintf_nowarn(cbuf,STR_MAX_LEN,"Unhandled Error of type[%u] = %s in Gerbicz-checkproduct mod-squaring update - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr));
-					mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+					snprintf(cbuf,STR_MAX_LEN*2,"Unhandled Error of type[%u] = %s in Gerbicz-checkproduct mod-squaring update - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr));
+					mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 				//	goto GET_NEXT_ASSIGNMENT;
 				}
 			}
@@ -2127,7 +2127,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					if(filegrep(STATFILE,"ITERS_BETWEEN_GCHECK_UPDATES",cbuf,0)) {
 						char_addr = strstr(cbuf,"RES_SHIFT = ") + 12;	// Skip ahead by length of search-substring
 						itmp64 = strtoull(char_addr, &cptr, 10);
-						ASSERT(HERE, itmp64 != -1ull, "strtoull() overflow detected.");
+						ASSERT(itmp64 != -1ull, "strtoull() overflow detected.");
 					}
 				#else
 					itmp64 = RES_SHIFT;
@@ -2145,8 +2145,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					}
 				#endif
 				}
-				fprintf(stderr,"Recovered initial shift %llu\n",itmp64);
-				ASSERT(HERE, (itmp64>>32) == 0ull,"Shift must be < 2^32!");
+				fprintf(stderr,"Recovered initial shift %" PRIu64 "\n",itmp64);
+				ASSERT((itmp64>>32) == 0ull,"Shift must be < 2^32!");
 				GCHECK_SHIFT = itmp64;
 			}
 			mi64_shlc(c_uint64_ptr, c_uint64_ptr, (uint32)p, (uint32)GCHECK_SHIFT, j, (MODULUS_TYPE == MODULUS_TYPE_FERMAT));
@@ -2155,7 +2155,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			// Use mi64 routines to compute d[]*PRP_BASE and do ensuing equality check:
 			itmp64 = ((MODULUS_TYPE == MODULUS_TYPE_FERMAT) ? 3ull : (uint64)PRP_BASE);	// Fermat-mod uses PRP_BASE to store 2 for random-shift-offset scheme
 			c_uint64_ptr[j] = mi64_mul_scalar(c_uint64_ptr, itmp64, c_uint64_ptr, j);
-			ASSERT(HERE, c_uint64_ptr[j] == 0ull, "d[]*PRP_BASE result has unexpected carryout!");
+			ASSERT(c_uint64_ptr[j] == 0ull, "d[]*PRP_BASE result has unexpected carryout!");
 			// Need to (mod N) ... store modulus N in d[] doubles-array, which is freed up by above convert_res_FP_bytewise(d,...) call:
 			if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
 				// Loop rather than call to mi64_set_eq_scalar here, since need to set all elts = -1:
@@ -2169,11 +2169,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			for(i = 1; i < itmp64; i++) {
 				if(!c_uint64_ptr[j] && mi64_cmpult(c_uint64_ptr,d_uint64_ptr,j)) break;
 				cy = mi64_sub(c_uint64_ptr,d_uint64_ptr,c_uint64_ptr,j);	// c -= d, with d = 2^p-1
-				c_uint64_ptr[j] -= cy;	//ASSERT(HERE, cy == 0ull, "mi64_sub result has unexpected borrow!");
+				c_uint64_ptr[j] -= cy;	//ASSERT(cy == 0ull, "mi64_sub result has unexpected borrow!");
 			}
-			ASSERT(HERE, mi64_cmpult(c_uint64_ptr,d_uint64_ptr,j), "Gerbicz checkproduct reduction (mod 2^p-1) failed!");
+			ASSERT(mi64_cmpult(c_uint64_ptr,d_uint64_ptr,j), "Gerbicz checkproduct reduction (mod 2^p-1) failed!");
 			if(mi64_cmp_eq(e_uint64_ptr,c_uint64_ptr,j)) {
-				sprintf(cbuf,"At iteration %u, shift = %llu: Gerbicz check passed.\n",ihi,RES_SHIFT);
+				sprintf(cbuf,"At iteration %u, shift = %" PRIu64 ": Gerbicz check passed.\n",ihi,RES_SHIFT);
 				mlucas_fprint(cbuf,0);
 				// In G-check case we need b[] for that, thus skipped the d = b redundancy-copy ... do that now:
 				memcpy(d, b, nbytes);
@@ -2214,7 +2214,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			strcpy(cstr, RESTARTFILE);
 			strcat(cstr, cbuf);
 			if(rename(RESTARTFILE, cstr)) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: unable to rename %s restart file ==> %s ... skipping every-10M-iteration restart file archiving\n",WORKFILE,cstr);
+				snprintf(cbuf,STR_MAX_LEN*2,"ERROR: unable to rename %s restart file ==> %s ... skipping every-10M-iteration restart file archiving\n",WORKFILE,cstr);
 				fprintf(stderr,"%s",cbuf);
 			}
 		}	// ilo a multiple of 10 million?
@@ -2235,7 +2235,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				RESTARTFILE[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f');
 			}
 		} else {
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open restart file %s for write of checkpoint data.\n",RESTARTFILE);
+			snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open restart file %s for write of checkpoint data.\n",RESTARTFILE);
 			mlucas_fprint(cbuf,1);
 			/*
 			Don't want to assert here - asllow processing to continue, in case this is a transient failure-to-open.
@@ -2255,7 +2255,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					write_ppm1_savefiles(cstr,p,n,fp, itmp64, (uint8*)arrtmp,Res64,Res35m1,Res36m1, (uint8*)e_uint64_ptr,i1,i2,i3);
 					fclose(fp); fp = 0x0;
 				} else {
-					snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open Gerbicz-check savefile %s for write of checkpoint data.\n",cstr);
+					snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open Gerbicz-check savefile %s for write of checkpoint data.\n",cstr);
 					mlucas_fprint(cbuf,1);
 				}
 			}	// ihi a multiple of ITERS_BETWEEN_GCHECKS?
@@ -2305,7 +2305,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 
 	/* If Selftest mode... */
 	if(INTERACT) {
-		fprintf(stderr, "%u iterations of %s with FFT length %u = %u K, final residue shift count = %llu\n",timing_test_iters,PSTRING,n,kblocks,RES_SHIFT);
+		fprintf(stderr, "%u iterations of %s with FFT length %u = %u K, final residue shift count = %" PRIu64 "\n",timing_test_iters,PSTRING,n,kblocks,RES_SHIFT);
 		// If TEST_TYPE non-default (e.g. PRP for Mersennes), add text indicating that:
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE && TEST_TYPE == TEST_TYPE_PRP)
 			sprintf(cbuf,"PRP-%u ",PRP_BASE);
@@ -2314,15 +2314,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 
 		/* If Fermat number, make sure exponent a power of 2: */
 		if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
-			ASSERT(HERE, (p >> findex) == 1,"Require (p >> findex) == 1");
+			ASSERT((p >> findex) == 1,"Require (p >> findex) == 1");
 
 		if(timing_test_iters > AME_ITER_START) {
 			AME /= (timing_test_iters - AME_ITER_START);
-			fprintf(stderr, "%sRes64: %016llX. AvgMaxErr = %10.9f. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, AME, MME, VERSION);
+			fprintf(stderr, "%sRes64: %016" PRIX64 ". AvgMaxErr = %10.9f. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, AME, MME, VERSION);
 		} else {
-			fprintf(stderr, "%sRes64: %016llX. AvgMaxErr N/A. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, MME, VERSION);
+			fprintf(stderr, "%sRes64: %016" PRIX64 ". AvgMaxErr N/A. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, MME, VERSION);
 		}
-		/* MSVC/.NET incorrectly output these when using uint64 and %20llu format, so cast to double and print: */
+		/* MSVC/.NET incorrectly output these when using uint64 and the "%20" PRIu64 format, so cast to double and print: */
 		fprintf(stderr, "Res mod 2^35 - 1 = %20.0f\n",(double)Res35m1);
 		fprintf(stderr, "Res mod 2^36 - 1 = %20.0f\n",(double)Res36m1);
 		/* If they are provided, check the Selfridge-Hurwitz residues: */
@@ -2382,7 +2382,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		*/
 		if(TEST_TYPE == TEST_TYPE_PRP && MODULUS_TYPE != MODULUS_TYPE_FERMAT)	// Applies only to mod-M(p) case,
 		{																		// Pepin-test and LL are handled in next clause.
-			ASSERT(HERE, ihi == p, "Gerbicz-check-modified PRP-test requires p mod-squarings!");
+			ASSERT(ihi == p, "Gerbicz-check-modified PRP-test requires p mod-squarings!");
 			/* Final PRP-residue which is *reported*, OTOH, is the standard Fermat-style (p-2)-squaring one.
 			That requires us to do 2 mod-divs of the 2-squares-too-many prp-residue r by the PRP-test base b.
 			If b divides r, we're good. Otherwise, need to find multiple of modulus m = 2^p-1 which needs to
@@ -2410,7 +2410,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			mmodb = twopmmodq64(p,itmp64);				// m"
 			// In the most common case PRP_BASE = 3, use that 2^6 == 1 (mod 9), thus 2^p == 2^(p mod 6) (mod 9)
 			if(PRP_BASE == 3)
-				ASSERT(HERE, mmodb == (1ull<<(p % 6)) % 9,"2^p == 2^(p mod 6) (mod 9) fails!");
+				ASSERT(mmodb == (1ull<<(p % 6)) % 9,"2^p == 2^(p mod 6) (mod 9) fails!");
 			// mmodb = (2^p-1) % base ... for reasons unknown, the macro MOD_SUB64 was not inlined properly under gdb
 			if(mmodb)
 				mmodb--;
@@ -2424,14 +2424,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		  #else
 			MUL_LOHI64(rmodb,i1, i2, i3);
 		  #endif
-			i2 %= itmp64;	ASSERT(HERE, i3 == 0ull, "K-multiplier needs 64-bit reduction (mod b^2)!");
+			i2 %= itmp64;	ASSERT(i3 == 0ull, "K-multiplier needs 64-bit reduction (mod b^2)!");
 			if(i2) i2 = itmp64 - i2;	// if(k) k = -r".mi" (mod b^2) = b^2 - r".mi" .
 			// i2 contains the needed multiplier k. Since ensuing quotient computation needs separate arrays
 			// for dividend and quotient, stash output of mi64_mul_scalar_add_vec2 in c[] and ensuing quotient back in arrtmp[]:
 			c_uint64_ptr[j] = mi64_mul_scalar_add_vec2(d_uint64_ptr,i2,arrtmp, c_uint64_ptr, j);
 			// Now short-div - allowing for the possibility of a carryout from above mi64_mul_scalar_add_vec2() call -
 			// by base and check that remainder 0. Note that we do want the quotient now, as that is our reside/base:
-			mi64_div(c_uint64_ptr, &itmp64, j+1,1, arrtmp,&rmodb);	ASSERT(HERE, rmodb == 0ull,"After short-div, R != 0 (mod B)");
+			mi64_div(c_uint64_ptr, &itmp64, j+1,1, arrtmp,&rmodb);	ASSERT(rmodb == 0ull,"After short-div, R != 0 (mod B)");
 			// And recompute the S-H residues:
 			res_SH(arrtmp,j,&Res64,&Res35m1,&Res36m1);
 			// Now that residue is standard Fermat-PRP-test one, check if == 1:
@@ -2465,7 +2465,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		char Res2048[513];
 		// Must save Res2048 before PRP cofactor test: https://github.com/primesearch/Mlucas/issues/25
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
-			for (int i = 31; i >= 0; i--) sprintf(Res2048+496-i*16, "%016llX", arrtmp[i]);
+			for (int i = 31; i >= 0; i--) sprintf(Res2048+496-i*16, "%016" PRIX64, arrtmp[i]);
 
 		// v21: PRP-CF: Cofactor-PRP test applies to primality/Fermat (which we follow by 1 additional mod-squaring
 		// to convert the base^((N-1)/2) Pepin/Euler-PRP residue to a base^(N-1) Fermat-PRP one) and PRP/Mersenne residues:
@@ -2491,7 +2491,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			if(isprime) {
 				if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) {
 					/*... this gets written both to file and to stdout, the latter irrespective of whether the run is in interactive mode...	*/
-					snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is a new FERMAT PRIME!!!\nPlease send e-mail to ewmayer@aol.com.\n",PSTRING);
+					snprintf(cbuf,STR_MAX_LEN*2, "%s is a new FERMAT PRIME!!!\nPlease send e-mail to ewmayer@aol.com.\n",PSTRING);
 					mlucas_fprint(cbuf,1);
 				}
 				else if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2501,16 +2501,16 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 							break;
 					}
 					if(knowns[i] != 0) {
-						snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is a known MERSENNE PRIME.\n",PSTRING);
+						snprintf(cbuf,STR_MAX_LEN*2, "%s is a known MERSENNE PRIME.\n",PSTRING);
 						mlucas_fprint(cbuf,(INTERACT || scrnFlag));	// Latter clause == "Echo output to stderr?"
 					} else {
 						// This gets written both to file and to stderr, the latter irrespective of whether the run is in interactive mode:
-						snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is a (probable) new MERSENNE PRIME!!!\nPlease send e-mail to ewmayer@aol.com and woltman@alum.mit.edu.\n",PSTRING);
+						snprintf(cbuf,STR_MAX_LEN*2, "%s is a (probable) new MERSENNE PRIME!!!\nPlease send e-mail to ewmayer@aol.com and woltman@alum.mit.edu.\n",PSTRING);
 						mlucas_fprint(cbuf,1);
 					}
 				}
 				else
-					ASSERT(HERE, 0, "Unsupported modulus type!");
+					ASSERT(0, "Unsupported modulus type!");
 			}
 			/*
 			The more likely scenario - it's not prime, so we form a 64-bit residue and write that.
@@ -2520,11 +2520,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				// Otherwise, write the 64-bit hex residue. As of v19, we write the old-style HRF-formatted result
 				// just to the exponent-specific logfile, and the server-expected JSON-formatted result to the results file:
 				// Note that Fermat primality tests are not submitted to server, so accordingly we slightly modify the output. More info: https://github.com/primesearch/Mlucas/pull/11
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is not prime. Program: E%s. Final residue shift count = %llu.\n",PSTRING,VERSION,RES_SHIFT);
+				snprintf(cbuf,STR_MAX_LEN*2, "%s is not prime. Program: E%s. Final residue shift count = %" PRIu64 ".\n",PSTRING,VERSION,RES_SHIFT);
 				mlucas_fprint(cbuf,1);
-				if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf_nowarn(cbuf,STR_MAX_LEN, "Selfridge-Hurwitz residues Res64,Res35m1,Res36m1 = %016llX,%11llu,%11llu.\n",Res64,Res35m1,Res36m1);
+				if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf(cbuf,STR_MAX_LEN*2, "Selfridge-Hurwitz residues Res64,Res35m1,Res36m1 = %016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 ".\n",Res64,Res35m1,Res36m1);
 				else {
-					snprintf_nowarn(cbuf,STR_MAX_LEN, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
+					snprintf(cbuf,STR_MAX_LEN*2, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
 					// v19: Finish with the JSON-formatted result line:
 					fp = mlucas_fopen(OFILE,"a");
 					if(fp) {
@@ -2534,7 +2534,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				mlucas_fprint(cbuf,1);
 			}
 		} else if (MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {	// Cofactor-PRP run:
-			snprintf_nowarn(cbuf,STR_MAX_LEN,"If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
+			snprintf(cbuf,STR_MAX_LEN*2,"If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
 			mlucas_fprint(cbuf,1);
 			// Write JSON-formatted result line to results file:
 			fp = mlucas_fopen(OFILE,"a");
@@ -2562,7 +2562,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					gm_time = localtime(&calendar_time);
 				strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S UTC",gm_time);
 				generate_JSON_report(0,p,n,0ull,NULL,timebuffer, B1,B2,gcd_str,s2_partial, cstr);	// cstr holds JSONified output
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
+				snprintf(cbuf,STR_MAX_LEN*2, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
 				mlucas_fprint(cbuf,0);
 				fp = mlucas_fopen(OFILE,"a");
 				if(fp) {
@@ -2596,7 +2596,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					if( PM1_S2_NBUF > ((uint32)(j*1024./(n>>7)) - 5) )
 						fprintf(stderr,"WARNING: User-specified maximum number of Stage 2 buffers may exceed %u MB of available RAM.\n",j);
 				}
-				ASSERT(HERE, PM1_S2_NBUF >= 24,"p-1 Stage 2 requires at least 24 residue-sized memory buffers!\n");
+				ASSERT(PM1_S2_NBUF >= 24,"p-1 Stage 2 requires at least 24 residue-sized memory buffers!\n");
 				// See if S2 restart file exists:
 				strcpy(cstr,RESTARTFILE); cstr[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f'); strcat(cstr, ".s2");
 				// If a regular (non-continuation, i.e. B2_start = B1) stage 2 and S2 restart file exists, read
@@ -2608,17 +2608,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 						// This snip reads the relocation-prime from the high byte of the nsquares field, byte 10 of the S2 savefile:
 						i = fgetc(fp);
 						if(!test_types_compatible(i, TEST_TYPE)) {
-							snprintf_nowarn(cbuf,STR_MAX_LEN, "%s: TEST_TYPE != fgetc(fp)\n",cstr); ASSERT(HERE,0,cbuf);
+							snprintf(cbuf,STR_MAX_LEN*2, "%s: TEST_TYPE != fgetc(fp)\n",cstr); ASSERT(0,cbuf);
 						}
 						if((i = fgetc(fp)) != MODULUS_TYPE) {
-							snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: %s: MODULUS_TYPE != fgetc(fp)\n",cstr); ASSERT(HERE,0,cbuf);
+							snprintf(cbuf,STR_MAX_LEN*2, "ERROR: %s: MODULUS_TYPE != fgetc(fp)\n",cstr); ASSERT(0,cbuf);
 						}
 						itmp64 = 0ull; 	for(j = 0; j < 8; j++) { i = fgetc(fp);	itmp64 += (uint64)i << (8*j); }
 						fclose(fp); fp = 0x0;
 						if(i != EOF)	// Needed to handle case where .s2 file was touched but ended up empty or < 10 bytes long
 							psmall = i;
 						itmp64 &= 0x00FFFFFFFFFFFFFFull;	// Mask off psmall to get stage 2 q of checkpoint data
-						fprintf(stderr,"Read iter = %llu and relocation-prime psmall = %u from savefile %s.\n",itmp64,psmall,cstr);
+						fprintf(stderr,"Read iter = %" PRIu64 " and relocation-prime psmall = %u from savefile %s.\n",itmp64,psmall,cstr);
 						// Now parse logfile to get proper B2 and validate corresponding B2_start vs B2/[psmall from .s2 file].
 						// Logfiles can be messy and include one or more aborted-restarts; we want the last B2_start-containing
 						// entry followed by a savefile-write entry, as inferred from presence of a "% complete" substring:
@@ -2627,9 +2627,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 						// If match "B2_start =", read bigstep D from match-line and infer relocation-prime psmall from D:
 						if(strlen(cbuf)) {
 							char_addr = strstr(cbuf,"B2_start = ");
-							B2_start = (uint64)strtoull(char_addr+11, &cptr, 10);	ASSERT(HERE, B2_start != -1ull, "strtoull() overflow detected.");
+							B2_start = (uint64)strtoull(char_addr+11, &cptr, 10);	ASSERT(B2_start != -1ull, "strtoull() overflow detected.");
 							char_addr = strstr(cbuf,"B2 = ");
-							B2 = (uint64)strtoull(char_addr+5, &cptr, 10);	ASSERT(HERE, B2 != -1ull, "strtoull() overflow detected.");
+							B2 = (uint64)strtoull(char_addr+5, &cptr, 10);	ASSERT(B2 != -1ull, "strtoull() overflow detected.");
 							char_addr = strstr(cbuf,"Bigstep = ");
 							if(char_addr) {
 								i = strtoul(char_addr+10, &endp, 10);
@@ -2644,7 +2644,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 							}
 							// Now compare the params from the restartfile vs those captured in the log:
 							if(psmall)
-								ASSERT(HERE, psmall == i && B2_start == B2/psmall, "Stage 2 params mismatch those captured in the .stat logfile!");
+								ASSERT(psmall == i && B2_start == B2/psmall, "Stage 2 params mismatch those captured in the .stat logfile!");
 							else
 								psmall = i;
 							// If stage 2 q of checkpoint >= B2, proceed directly to GCD:
@@ -2674,7 +2674,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					exit(1);
 				} else if(ierr) {
 					sprintf(cbuf,"p-1 stage 2 hit an unhandled error of type[%u] = %s! Aborting.",ierr,returnMlucasErrCode(ierr));
-					ASSERT(HERE,0,cbuf);
+					ASSERT(0,cbuf);
 				}
 				// If gcd_str non-empty on return, it means one of the intermediate S2 GCDs turned up a factor,
 				// prompting an early-return, In this case the S2 code will have reset B2 to reflect the actual interval run.
@@ -2694,7 +2694,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 					gm_time = localtime(&calendar_time);
 				strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S UTC",gm_time);
 				generate_JSON_report(0,p,n,0ull,NULL,timebuffer, B1,B2,gcd_str,s2_partial, cstr);	// cstr holds JSONified output
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
+				snprintf(cbuf,STR_MAX_LEN*2, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr);
 				mlucas_fprint(cbuf,0);
 				fp = mlucas_fopen(OFILE,"a");
 				if(fp){
@@ -2705,7 +2705,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			fprintf(stderr,"User specified low-mem run mode ... no stage 2.\n");
 		}
 	} else {
-		ASSERT(HERE, 0, "Unrecognized test type!");
+		ASSERT(0, "Unrecognized test type!");
 	}	/* endif(TEST_TYPE == TEST_TYPE_PRIMALITY) */
 
 	/*...If successful completion, delete the secondary restart files...save the primary in case it's a prime,
@@ -2728,7 +2728,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		*/
 		strcpy(cstr, RESTARTFILE); strcat(cstr, ".s2");
 		if(remove(cstr)) {
-			snprintf_nowarn(cbuf,STR_MAX_LEN,"INFO: Unable to remove stage 2 savefile %s.\n",cstr);
+			snprintf(cbuf,STR_MAX_LEN*2,"INFO: Unable to remove stage 2 savefile %s.\n",cstr);
 			mlucas_fprint(cbuf,1);
 		}
 		if(!s2_continuation) {
@@ -2743,7 +2743,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			// if primary missing/corrupt, rename secondary q[expo] ==> [p|f][expo].s1:
 			if(TEST_TYPE == TEST_TYPE_PM1 && !s2_continuation) {
 				if(rename(RESTARTFILE, cstr)) {
-					snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: unable to rename the p-1 stage 1 savefile %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr);
+					snprintf(cbuf,STR_MAX_LEN*2,"ERROR: unable to rename the p-1 stage 1 savefile %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr);
 					mlucas_fprint(cbuf,1);
 				}
 			} else if(TEST_TYPE == TEST_TYPE_PRIMALITY || TEST_TYPE == TEST_TYPE_PRP) {
@@ -2751,7 +2751,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				if(RESTARTFILE[0] == 'q') {
 					RESTARTFILE[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f');
 					if(rename(cstr, RESTARTFILE)) {
-						snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: Primary savefile missing/corrupt, but unable to rename the secondary %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr);
+						snprintf(cbuf,STR_MAX_LEN*2,"ERROR: Primary savefile missing/corrupt, but unable to rename the secondary %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr);
 						mlucas_fprint(cbuf,1);
 					}
 				} else if(remove(cstr))	// ...otherwise delete the secondary
@@ -2785,21 +2785,21 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 	if(!INTERACT) {
 		//*** IN THIS CASE MUST MAKE SURE CBUF,CSTR ONLY GET OVERWRITTEN ON ERROR ERROR, SINCE THEY CONTAIN THE SPLIT ASSIGNMENT! ***
 		if(split_curr_assignment) {
-			sprintf(ESTRING,"%llu",p);	// Set ESTRING here, as this bypasses the normal route for getting to GET_NEXT_ASSIGNMENT
-			ASSERT(HERE, TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: split_curr_assignment = TRUE, but TEST_TYPE != PM1.");
+			sprintf(ESTRING,"%" PRIu64,p);	// Set ESTRING here, as this bypasses the normal route for getting to GET_NEXT_ASSIGNMENT
+			ASSERT(TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: split_curr_assignment = TRUE, but TEST_TYPE != PM1.");
 		}
 
 		fp = mlucas_fopen(WORKFILE,"r");
 		if(!fp) {
 			sprintf(cbuf,"ERROR: unable to open %s file for reading.\n",WORKFILE);
-			ASSERT(HERE,0,cbuf);
+			ASSERT(0,cbuf);
 		}
 		/* Remove any WINI.TMP file that may be present: */
 		remove("WINI.TMP");
 		fq = mlucas_fopen("WINI.TMP", "w");
 		if(!fq) {
 			sprintf(cbuf, "Unable to open WINI.TMP file for writing.\n");
-			ASSERT(HERE,0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 	GET_NEXT:
@@ -2807,7 +2807,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 		i = 0;	// This counter tells how many *additional* assignments exist in worktodo
 		if(!fgets(in_line, STR_MAX_LEN, fp)) {
 			sprintf(cbuf, "ERROR: %s file not found at end of current-assignment processing\n", WORKFILE);
-			ASSERT(HERE,0,cbuf);
+			ASSERT(0,cbuf);
 		}
 		// v20.1.1: Parse all lines whose 1st non-WS char is alphabetic;
 		char_addr = in_line;	j = 0;
@@ -2818,8 +2818,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 
 		// Look for m in first eligible assignment; for F[m], need to also look for 2^m in case assignment is in KBNC format:
 		if(!strstr(in_line, ESTRING) && !(MODULUS_TYPE == MODULUS_TYPE_FERMAT && strstr(in_line, BIN_EXP)) ) {
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: Current exponent %s not found in line 1 of %s file - quitting.\n", ESTRING, WORKFILE);
-			ASSERT(HERE,0,cbuf);
+			snprintf(cbuf,STR_MAX_LEN*2, "ERROR: Current exponent %s not found in line 1 of %s file - quitting.\n", ESTRING, WORKFILE);
+			ASSERT(0,cbuf);
 		} else {
 			/* If we just finished the TF or p-1 preprocessing step of an LL or PRP test,
 			update the current-assignment line to reflect that and write it out: */
@@ -2828,7 +2828,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 				if(TEST_TYPE == TEST_TYPE_TF) {
 					/* Factor depth assumed to follow the first comma in in_line: */
 					char_addr = strstr(char_addr, ",");
-					ASSERT(HERE, char_addr != 0x0,"Null char_addr");
+					ASSERT(char_addr != 0x0,"Null char_addr");
 					sprintf(++char_addr, "%u", TF_BITS);
 					fputs(in_line, fq);
 				}
@@ -2843,16 +2843,16 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			} else if(stristr(in_line, "pminus1")) {
 				// If current p-1 assignment found a factor and resulted from splitting of a PRP/LL assignment -
 				// note that split_curr_assignment == TRUE only at time of the initial splitting - delete them both:
-				ASSERT(HERE, TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: current assignment is Pminus1=, but TEST_TYPE != PM1.");
+				ASSERT(TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: current assignment is Pminus1=, but TEST_TYPE != PM1.");
 				if(strlen(gcd_str) != 0) {	// Found a factor?
-					char_addr = strstr(in_line, "=");	ASSERT(HERE,char_addr != 0x0,"Malformed assignment!");
+					char_addr = strstr(in_line, "=");	ASSERT(char_addr != 0x0,"Malformed assignment!");
 					char_addr++;
 					if(is_hex_string(char_addr, 32)) {
 						strncpy(aid,char_addr,32);
 					} else if(STREQN_NOCASE(char_addr,"n/a",3)) {
 						strncpy(aid,char_addr, 3);
 					} else {
-						snprintf_nowarn(cbuf,STR_MAX_LEN,"INFO: Assignment \"%s\" lacks a valid assignment ID ... proceeding anyway.\n",in_line);
+						snprintf(cbuf,STR_MAX_LEN*2,"INFO: Assignment \"%s\" lacks a valid assignment ID ... proceeding anyway.\n",in_line);
 						mlucas_fprint(cbuf,1);
 						aid[0] = '\0';	// This guarantees that the strstr(in_line,aid) part of on the next-assignment search below succeeds.
 					}
@@ -2892,13 +2892,13 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
 			fp = mlucas_fopen(WORKFILE,"w");
 			if(!fp) {
 				sprintf(cbuf,"ERROR: unable to open %s file for writing.\n", WORKFILE);
-				ASSERT(HERE,0,cbuf);
+				ASSERT(0,cbuf);
 			}
 
 			fq = mlucas_fopen("WINI.TMP", "r");
 			if(!fq) {
 				sprintf(cbuf,"Unable to open WINI.TMP file for reading.\n");
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 			while(fgets(in_line, STR_MAX_LEN, fq)) {
 				fputs(in_line, fp);
@@ -2954,21 +2954,21 @@ void 	Mlucas_init(void)
 
 	/* Set min. exponent (in terms of power of 2) that can be tested: */
 	/* Check that the purported min. FFT length is actually supported: */
-	ASSERT(HERE, get_fft_radices(MIN_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MIN_FFT_LENGTH_IN_K, 0) == 0");
+	ASSERT(get_fft_radices(MIN_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MIN_FFT_LENGTH_IN_K, 0) == 0");
 	n = (MIN_FFT_LENGTH_IN_K << 10);
 	/* Make sure N didn't overflow */
-	ASSERT(HERE, (n >> 10) == MIN_FFT_LENGTH_IN_K,"Require (n >> 10) == MIN_FFT_LENGTH_IN_K");
+	ASSERT((n >> 10) == MIN_FFT_LENGTH_IN_K,"Require (n >> 10) == MIN_FFT_LENGTH_IN_K");
 	PMIN = 2*n;	/* 2 bits per input is about the smallest we can test without getting nonzero-carry errors */
 
 	/* Set max. exponent (in terms of power of 2) that can be tested: */
 	/* Check that the purported max. FFT length is actually supported: */
-	ASSERT(HERE, get_fft_radices(MAX_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MAX_FFT_LENGTH_IN_K, 0) == 0");
+	ASSERT(get_fft_radices(MAX_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MAX_FFT_LENGTH_IN_K, 0) == 0");
 	n = (MAX_FFT_LENGTH_IN_K << 10);
 	/* Make sure N didn't overflow */
-	ASSERT(HERE, (n >> 10) == MAX_FFT_LENGTH_IN_K,"Require (n >> 10) == MAX_FFT_LENGTH_IN_K");
+	ASSERT((n >> 10) == MAX_FFT_LENGTH_IN_K,"Require (n >> 10) == MAX_FFT_LENGTH_IN_K");
 	PMAX = 1.05*given_N_get_maxP(n);	// Allow same wiggle room here as in ernstMain
 
-	ASSERT(HERE, PMAX > PMIN,"Require PMAX > PMIN");
+	ASSERT(PMAX > PMIN,"Require PMAX > PMIN");
 
 #if INCLUDE_TF
 	/* Simple self-tester for sieve factoring routines: */
@@ -2976,7 +2976,7 @@ void 	Mlucas_init(void)
 	if(test_fac() != 0)
 	{
 		sprintf(cbuf, "Mlucas_init : Trial-factoring self-test failed.\n");
-		fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+		fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 	}
 #endif
 #if 0	// v20: Use GMP GCD, own-rolled O(n*(log n)^2) one simply not in the cards.
@@ -2985,7 +2985,7 @@ void 	Mlucas_init(void)
 	if(test_gcd() != 0)
 	{
 		sprintf(cbuf, "Mlucas_init : GCD test failed.\n");
-		fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+		fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 	}
 #endif
 }
@@ -3038,16 +3038,16 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 	uint64 nbits, itmp64;
 	 int64 retval = -1;	// Make this signed to ease "not yet set?" check
 #ifdef USE_FGT61
-	ASSERT(HERE,0,"shift_word() needs to be modified to support FGT!");
+	ASSERT(0,"shift_word() needs to be modified to support FGT!");
 #endif
 	if(n != nsave || p != psave) {
 		first_entry = TRUE;	for(j = 0; j < (n>>6); j++) { BIGWORD_BITMAP[j] = 0ull; }	// Need to clear bitmap in case of multi-FFT-length run
 		bw = p%n; sw = n-bw;
 		/* If Fermat number, make sure exponent a power of 2: */
 		if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) {
-			ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE,"Require TRANSFORM_TYPE == RIGHT_ANGLE");
+			ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE,"Require TRANSFORM_TYPE == RIGHT_ANGLE");
 			findex = trailz64(p);
-			ASSERT(HERE, (p >> findex) == 1,"Require (p >> findex) == 1");
+			ASSERT((p >> findex) == 1,"Require (p >> findex) == 1");
 			/* For Fermat-mod, only need IBDWT weights table if it's a non-power-of-2-length transform, in which
 			case the table has {nwt = odd part of N} distinct elements. Avoid if() logic related to power-of-2-or-not
 			by initing a single DWT weight = 1.0 in the power-of-2 case and = 2^((j%nwt)/n) otherwise:
@@ -3056,7 +3056,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 			sw_div_n = sw*nwt/n;
 		}
 		else
-			ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER,"Require TRANSFORM_TYPE == REAL_WRAPPER");
+			ASSERT(TRANSFORM_TYPE == REAL_WRAPPER,"Require TRANSFORM_TYPE == REAL_WRAPPER");
 
 		/* Vector length a power of 2? */
 		pow2_fft = (n >> trailz32(n)) == 1;
@@ -3072,7 +3072,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 	// than I'd like, likely due to cache impacts of doing random-word lookups in the resulting 128kB and 64kB BIGWORD* arrays.
 	// Also had the "adjusting..." printfs enabled during the timing tests, 0 such adjustments needed for 10^9 random-shifts:
 	if(!first_entry) {
-	//	ASSERT(HERE, BIGWORD_BITMAP != 0x0 && BIGWORD_NBITS != 0x0, "BIGWORD_BITMAP and BIGWORD_NBITS arrays not alloc'ed!");
+	//	ASSERT(BIGWORD_BITMAP != 0x0 && BIGWORD_NBITS != 0x0, "BIGWORD_BITMAP and BIGWORD_NBITS arrays not alloc'ed!");
 		// Divide [shift] by the average bits per word to get a quick estimate of which word contains the corresponding bit:
 		j = shift*words_per_bit;	w64 = j>>6; mod64 = j&63;
 		// Then exactly compute the bitcount at the resulting word, by adding the BIGWORD_NBITS-array-stored exact
@@ -3086,7 +3086,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 		// Can gain a few % speed by commenting out this correction-step code, but even though I've encountered
 		// no cases where it's used in my (admittedly quite limited) testing, better safe than sorry:
 		if(shift < ii) {
-		//	printf("shift[%llu] < ii [%u] ... adjusting downward.\n",shift,ii);
+		//	printf("shift[%" PRIu64 "] < ii [%u] ... adjusting downward.\n",shift,ii);
 			while(shift < ii) {
 				if(--j < 0) {	// Note j is signed
 					j += 64;	w64 = j>>6; mod64 = j&63;	// Go to next-lower word of BIGWORD_BITMAP
@@ -3097,7 +3097,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 				ii -= curr_wd_bits;
 			}
 		} else if(shift >= (ii + curr_wd_bits) ) {
-		//	printf("shift[%llu] >= (ii + curr_wd_bits) [%u] ... adjusting upward.\n",shift,(ii + curr_wd_bits));
+		//	printf("shift[%" PRIu64 "] >= (ii + curr_wd_bits) [%u] ... adjusting upward.\n",shift,(ii + curr_wd_bits));
 			while(shift >= (ii + curr_wd_bits) ) {
 				if(++j > 63) {
 					j -= 64;	w64 = j>>6; mod64 = j&63;	// Go to next-higher word of BIGWORD_BITMAP
@@ -3156,11 +3156,11 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 
 	first_entry = FALSE;
 	psave = p; nsave = n; bits_per_word = (double)p/n; words_per_bit = 1.0/bits_per_word;
-	ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!");
-	ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
-	ASSERT(HERE,TRANSFORM_TYPE,"TRANSFORM_TYPE not set!");
-	ASSERT(HERE,TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!");
-	ASSERT(HERE,p > shift,"Specified shift count out of range!");
+	ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!");
+	ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
+	ASSERT(TRANSFORM_TYPE,"TRANSFORM_TYPE not set!");
+	ASSERT(TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!");
+	ASSERT(p > shift,"Specified shift count out of range!");
 
 	nbits = 0;	/* Total bits accumulated so far in the residue words processed	*/
 
@@ -3186,7 +3186,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 				if(retval < 0) {	// retval has not yet been set
 					curr_wd_bits = shift - (nbits - bits[ii]);	retval = ((uint64)j<<8) + curr_wd_bits;
 					cy = cy_in;
-				//	printf("Hit target bit %llu in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1);	ASSERT(HERE, curr_wd_bits <= bits[ii]-1,"GAH!");
+				//	printf("Hit target bit %" PRIu64 " in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1);	ASSERT(curr_wd_bits <= bits[ii]-1,"GAH!");
 				}
 			#ifdef USE_AVX512
 				j1 = (j & mask03) + br16[j&15];
@@ -3214,7 +3214,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 	}
 	else
 	{
-		ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE, "Invalid or uninited TRANSFORM_TYPE!");
+		ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE, "Invalid or uninited TRANSFORM_TYPE!");
 		curr_wd64 = -1; curr_bit64 = 0;
 	  for(i = 0; i < 2; i++)	// Two stride-2 loops to cover even and odd-indexed array elements, respectively:
 	  {
@@ -3236,7 +3236,7 @@ uint64 	shift_word(double a[], int n, const uint64 p, const uint64 shift, const
 				if(retval < 0) {	// retval has not yet been set
 					curr_wd_bits = shift - (nbits - bits[ii]);	retval = ((uint64)j<<8) + curr_wd_bits;
 					cy = cy_in;
-				//	printf("Hit target bit %llu in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1);	ASSERT(HERE, curr_wd_bits <= bits[ii]-1,"GAH!");
+				//	printf("Hit target bit %" PRIu64 " in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1);	ASSERT(curr_wd_bits <= bits[ii]-1,"GAH!");
 				}
 			#ifdef USE_AVX512
 				j1 = (j & mask03) + br16[j&15];
@@ -3300,35 +3300,35 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[]
 	uint32 kblocks = n>>10, npad = n + ( (n >> DAT_BITS) << PAD_BITS );	// npad = length of padded data array
 	uint64 itmp64, Res35m1, Res36m1;	// Res64 from original PRP passed in via pointer; these are locally-def'd
 	cbuf[0] = '\0';
-	snprintf_nowarn(cbuf,STR_MAX_LEN,"Suyama-PRP on cofactors of %s: using FFT length %uK = %u 8-byte floats.\n",PSTRING,kblocks,n);//	strcat(cbuf,cstr);
-//	sprintf(cstr, " this gives an average %20.15f bits per digit\n",1.0*p/n);	strcat(cbuf,cstr);
+	snprintf(cbuf,STR_MAX_LEN*2,"Suyama-PRP on cofactors of %s: using FFT length %uK = %u 8-byte floats.\n",PSTRING,kblocks,n);//	strcat(cbuf,cstr);
+	// sprintf(cstr, " this gives an average %20.15f bits per digit\n",1.0*p/n);	strcat(cbuf,cstr);
 	mlucas_fprint(cbuf,1);
 	// Pepin-test output = P, vs Mersenne-PRP (type 1) residue = A; thus only need an initial mod-squaring for:
 	// the former. Compute Fermat-PRP residue [A] from Euler-PRP (= Pepin-test) residue via a single mod-squaring:
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) {
-		ASSERT(HERE, ilo == p-1, "Fermat-mod cofactor-PRP test requires p-1 mod-squarings!");
-		snprintf_nowarn(cbuf,STR_MAX_LEN,"Doing one mod-%s squaring of iteration-%u residue [Res64 = %016llX] to get Fermat-PRP residue\n",PSTRING,ilo,*Res64);
+		ASSERT(ilo == p-1, "Fermat-mod cofactor-PRP test requires p-1 mod-squarings!");
+		snprintf(cbuf,STR_MAX_LEN*2,"Doing one mod-%s squaring of iteration-%u residue [Res64 = %016" PRIX64 "] to get Fermat-PRP residue\n",PSTRING,ilo,*Res64);
 		mlucas_fprint(cbuf,1);
 		ilo = 0;	ihi = ilo+1;	// Have checked that savefile residue is for a complete PRP test, so reset iteration counter
 		BASE_MULTIPLIER_BITS[0] = 0ull;
 /*A*/	ierr = func_mod_square(a, (int*)ci, n, ilo,ihi, 0ull, p, scrnFlag, tdiff, TRUE, 0x0);
 		convert_res_FP_bytewise(a, (uint8*)ci, n, p, Res64, &Res35m1, &Res36m1);	// Overwrite passed-in Pepin-Res64 with Fermat-PRP one
-		snprintf_nowarn(cbuf,STR_MAX_LEN,"MaxErr = %10.9f\n",MME); mlucas_fprint(cbuf,1);
+		snprintf(cbuf,STR_MAX_LEN,"MaxErr = %10.9f\n",MME); mlucas_fprint(cbuf,1);
 	} else if (MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {	// Mersenne PRP-CF doesn't have the Res35m1 or Res36m1 values passed in,
 		res_SH(ci,n,&itmp64,&Res35m1,&Res36m1);			// so we refresh these; see https://github.com/primesearch/Mlucas/issues/27
 	}
 	if(ierr) {
-		snprintf_nowarn(cbuf,STR_MAX_LEN,"Error of type[%u] = %s in mod-squaring ... aborting\n",ierr,returnMlucasErrCode(ierr));
-		mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+		snprintf(cbuf,STR_MAX_LEN*2,"Error of type[%u] = %s in mod-squaring ... aborting\n",ierr,returnMlucasErrCode(ierr));
+		mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 	}
-	sprintf(cbuf, "Fermat-PRP residue (A)     = 0x%016llX,%11llu,%11llu\n",*Res64,Res35m1,Res36m1);
+	sprintf(cbuf, "Fermat-PRP residue (A)     = 0x%016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 "\n",*Res64,Res35m1,Res36m1);	// literal "0x" + 16 hex digits; a '#' flag would count the prefix toward the width
 	mlucas_fprint(cbuf,1);
 	j = (p+63)>>6;	// j = uint64 vector length; Omit leading '1' bit in Fermat case since PRP-residue only has that set if a Fermat prime
 	mi64_set_eq(ai,ci,j);	// Copy packed-bit result back into low ceiling(p/64) bytes of A-vec (treated as a uint64 array)
 	// Compute "prime-factor product residue" [B] from Euler-PRP (= Pepin-test) residue ... first init bitwise mul-by-base array = F, i.e. storing product of known small-prime factors:
 	if(!nfac) {
 		sprintf(cbuf, "Cofactor-PRP test requires one or more known factors!");
-		mlucas_fprint(cbuf,0); ASSERT(HERE, 0, cbuf);
+		mlucas_fprint(cbuf,0); ASSERT(0, cbuf);
 	}
 	BASE_MULTIPLIER_BITS[0] = 1ull;	lenf = 1;
 	// Multiply each known-factor with current partial product of factors.
@@ -3340,8 +3340,8 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[]
 		mi64_mul_vector(BASE_MULTIPLIER_BITS,lenf, KNOWN_FACTORS+i,k, curr_fac,&lenf);
 		mi64_set_eq(BASE_MULTIPLIER_BITS,curr_fac,lenf);
 	}
-	ASSERT(HERE, (i>>2) == nfac, "Number of known-factors mismatch!");
-	ASSERT(HERE, lenf <= 20, "Product of known-factors too large to fit into curr_fac[]!");
+	ASSERT((i>>2) == nfac, "Number of known-factors mismatch!");
+	ASSERT(lenf <= 20, "Product of known-factors too large to fit into curr_fac[]!");
 	for(i = 0; i < lenf; i++) { curr_fac[i] = 0ull; }	// Re-zero the elts of curr_fac[] used as tmps in above loop
 	fbits = (lenf<<6) - mi64_leadz(BASE_MULTIPLIER_BITS, lenf);
 	// Now that have F stored in BASE_MULTIPLIER_BITS array, do powmod to get B = base^(F-1) (mod N):
@@ -3349,33 +3349,33 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[]
 	for(i = 0; i < npad; i++) { b[i] = 0; }	// Zero the elements of the floating-point array b[]
 	/****** Note: For Fermat *cofactor* PRP check we use a PRP assignment (not Pepin-test, though we need that residue as our
 	input), meaning that PRP_BASE = 3, not the speecial value 2 it has for residue-shift purposes in Pepin test mode: ******/
-	b[0] = PRP_BASE;	ASSERT(HERE, PRP_BASE < (1 << (uint32)ceil(1.0*p/n)), "PRP_BASE out of range!");
+	b[0] = PRP_BASE;	ASSERT(PRP_BASE < (1 << (uint32)ceil(1.0*p/n)), "PRP_BASE out of range!");
 	ilo = 0;	ihi = fbits-1;	// LR modpow; init b[0] = PRP_BASE takes cares of leftmots bit
 	RES_SHIFT = 0ull;	// Zero the residue-shift so as to not have to play games with where-to-inject-the-initial-seed
 	mi64_brev(BASE_MULTIPLIER_BITS,ihi);	// bit-reverse low [ihi] bits of BASE_MULTIPLIER_BITS:
 /*B*/	ierr = func_mod_square(b, (int*)ci, n, ilo,ihi, 0ull, p, scrnFlag, tdiff, TRUE, 0x0);
 	if(ierr) {
-		snprintf_nowarn(cbuf,STR_MAX_LEN,"Error of type[%u] = %s on iteration %u of mod-squaring chain ... aborting\n",ierr,returnMlucasErrCode(ierr),ROE_ITER);
-		mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+		snprintf(cbuf,STR_MAX_LEN*2,"Error of type[%u] = %s on iteration %u of mod-squaring chain ... aborting\n",ierr,returnMlucasErrCode(ierr),ROE_ITER);
+		mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 	}
 	sprintf(cbuf,"Processed %u bits in binary modpow; MaxErr = %10.9f\n",ihi,MME);
 	convert_res_FP_bytewise(b, (uint8*)ci, n, p, &itmp64, &Res35m1, &Res36m1);	// Res64 reserved for Fermat-PRP result; use itmp64 here
-	sprintf(cstr, "%u^(F-1) residue (B)        = 0x%016llX,%11llu,%11llu\n",PRP_BASE,itmp64,Res35m1,Res36m1);
+	sprintf(cstr, "%u^(F-1) residue (B)        = 0x%016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 "\n",PRP_BASE,itmp64,Res35m1,Res36m1);	// literal "0x" + 16 hex digits; a '#' flag would count the prefix toward the width
 	strcat(cbuf,cstr);	mlucas_fprint(cbuf,1);
-	ASSERT(HERE, j = (p+63)>>6,"uint64 vector length got clobbered!");
+	ASSERT(j = (p+63)>>6,"uint64 vector length got clobbered!");
 	mi64_set_eq(bi,ci,j);	// Copy packed-bit result into low j limbs of B-vec (treated as a uint64 array)
 	itmp64 = mi64_sub(ai,bi, ai,j);
 	// If result < 0, need to add Modulus - for N = Fm,Mp this means +-1 in LSW, respectively.
 	// For Fermat case, the borrow out of the high limb in the preceding vector-sub is canceled by the
 	// leading binary '1' in F[m]; in the Mersenne case, need to explicitly add 2^(p%64) to high limb:
 	if(itmp64) {
-		ASSERT(HERE, itmp64 == 1ull,"Carryout = 1 expected!");
+		ASSERT(itmp64 == 1ull,"Carryout = 1 expected!");
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
 			itmp64 = mi64_sub_scalar(ai,1ull, ai,j);
 			ai[j-1] += 1ull << (p&63);
 		} else {
 			itmp64 = mi64_add_scalar(ai,1ull, ai,j);
-		}	ASSERT(HERE, itmp64 == 0ull,"Carryout = 0 expected!");
+		}	ASSERT(itmp64 == 0ull,"Carryout = 0 expected!");
 	}
 	// B-array again free, re-use in uint64-cast form to compute C = Fm/F and (A-B) mod C:
 	// Compute Modulus ... note mi64-vecs have no cache-oriented element padding:
@@ -3394,23 +3394,23 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[]
 	BASE_MULTIPLIER_BITS[lenf-1] += 1ull << (fbits-1);	// Restore leftmost bit ...
 	BASE_MULTIPLIER_BITS[     0] += 1ull;	// ... and add 1 to recover F; no chance of a carryout here
 	// Since F << N, use Mont-mul-div for C - quotient overwrites N, no rem-vec needed, just verify that F is in fact a divisor:
-	ASSERT(HERE, 1 == mi64_div(bi,BASE_MULTIPLIER_BITS, j,lenf, ci,0x0), "C = N/F should have 0 remainder!");	// C in ci[]
+	ASSERT(1 == mi64_div(bi,BASE_MULTIPLIER_BITS, j,lenf, ci,0x0), "C = N/F should have 0 remainder!");	// C in ci[]
 	j -= (MODULUS_TYPE == MODULUS_TYPE_FERMAT);	// In Fermat case, undo the above j++ used to insert the leading bit in F[m]
 	i = j;	j = mi64_getlen(ci, j);	// *** Apr 2022 bug: don't add extra limb for Fermat-case to i here since (A-B) < N ***
 // R = (A - B) mod C in B-array (bi[]); store Q = (A - B)/C in curr_fac[] in case want to remultiply and verify Q*C + R = (A - B):
-	sprintf(cbuf,"(A - B) Res64 = 0x%016llX, C Res64 = 0x%016llX\n",ai[0],ci[0]);
+	sprintf(cbuf,"(A - B) Res64 = 0x%016" PRIX64 ", C Res64 = 0x%016" PRIX64 "\n",ai[0],ci[0]);	// literal "0x" + 16 hex digits; a '#' flag would count the prefix toward the width
 	mlucas_fprint(cbuf,1);
 	mi64_div_binary(ai,ci, i,j, curr_fac,(uint32 *)&k, bi);	// On return, k has quotient length; curr_fac[] = quo, bi[] = rem
-	snprintf_nowarn(cbuf,STR_MAX_LEN,"(A - B)/C: Quotient = %s, Remainder Res64 = 0x%016llX\n",&cstr[convert_mi64_base10_char(cstr,curr_fac,k,0)],bi[0]);
+	snprintf(cbuf,STR_MAX_LEN*2,"(A - B)/C: Quotient = %s, Remainder Res64 = 0x%016" PRIX64 "\n",&cstr[convert_mi64_base10_char(cstr,curr_fac,k,0)],bi[0]);	// literal "0x" + 16 hex digits; a '#' flag would count the prefix toward the width
 	mlucas_fprint(cbuf,1);
 	// For 1-word quotient q, double-check binary-div result by computing (q*denominator + r) and comparing vs numerator:
   #if 0	/*** May 2022: This overwrites ci[], which hoses the is-cofactor-a-prime-power GCD() below ***/
 	if(k == 1) {
-		ASSERT(HERE, 0 == mi64_mul_scalar_add_vec2(ci, curr_fac[0], bi, ci, i), "Unexpected carryout!");
-		ASSERT(HERE, 1 == mi64_cmp_eq(ai,ci,i), "Q*C + R = (A - B) check fails!");
+		ASSERT(0 == mi64_mul_scalar_add_vec2(ci, curr_fac[0], bi, ci, i), "Unexpected carryout!");
+		ASSERT(1 == mi64_cmp_eq(ai,ci,i), "Q*C + R = (A - B) check fails!");
 	}
   #endif
-	snprintf_nowarn(cbuf,STR_MAX_LEN,"Suyama Cofactor-PRP test of %s",PSTRING);
+	snprintf(cbuf,STR_MAX_LEN*2,"Suyama Cofactor-PRP test of %s",PSTRING);
 	// Base-2 log of cofactor = lg(Fm/F) = lg(Fm) - lg(F) ~= 2^m - lg(F). 2^m stored in p, sub lg(F) in loop below:
 	double lg_cof = p,lg_fac,log10_2 = 0.30102999566398119521;	// Use lg_fac to store log2 of each factor as we recompute it
 	for(i = 0; KNOWN_FACTORS[i] != 0ull; i += 4) {
@@ -3428,7 +3428,7 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[]
 		sprintf(cbuf,"This cofactor is PROBABLE PRIME [PRP%u].\n",i);	mlucas_fprint(cbuf,1);
 	} else {
 		res_SH(bi,j,&itmp64,&Res35m1,&Res36m1);	// Res64 reserved for Fermat-PRP result; use itmp64 here
-		sprintf(cstr," with FFT length %u = %u K:\n\t(A - B) mod C has Res64,35m1,36m1: 0x%016llX,%11llu,%11llu.\n",n,kblocks,itmp64,Res35m1,Res36m1);
+		sprintf(cstr," with FFT length %u = %u K:\n\t(A - B) mod C has Res64,35m1,36m1: 0x%016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 ".\n",n,kblocks,itmp64,Res35m1,Res36m1);	// literal "0x" + 16 hex digits; a '#' flag would count the prefix toward the width
 		strcat(cbuf,cstr);	mlucas_fprint(cbuf,1);
 		/* Compute gcd(A - B,C) [cf. Phil Moore post: https://mersenneforum.org/showpost.php?p=210599&postcount=67]
 			"Take the GCD of the difference of these two residues (A - B) with C. If the GCD is equal to 1,
@@ -3956,9 +3956,9 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 	// v18: Enable access to argc/argv outside main():
 	global_argv = argv;
 
-	ASSERT(HERE, (MersVec[numTest-1].fftLength != 0) &&  (MersVec[numTest].fftLength == 0), "numTest != MersVec allocated size!");
-	ASSERT(HERE, (MvecPRP[numTest-1].fftLength != 0) &&  (MvecPRP[numTest].fftLength == 0), "numTest != MvecPRP allocated size!");
-	ASSERT(HERE, (FermVec[numFerm-1].fftLength != 0) &&  (FermVec[numFerm].fftLength == 0), "numFerm != FermVec allocated size!");
+	ASSERT((MersVec[numTest-1].fftLength != 0) &&  (MersVec[numTest].fftLength == 0), "numTest != MersVec allocated size!");
+	ASSERT((MvecPRP[numTest-1].fftLength != 0) &&  (MvecPRP[numTest].fftLength == 0), "numTest != MvecPRP allocated size!");
+	ASSERT((FermVec[numFerm-1].fftLength != 0) &&  (FermVec[numFerm].fftLength == 0), "numFerm != FermVec allocated size!");
 
 	/*...check that various data types are of the assumed length
 	and do some other basic sanity checks:
@@ -3979,7 +3979,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 	nargs = 1;
 	while(argv[nargs])
 	{
-		strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+		strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 		if(nargs > argc) {	// == no longer applies since e.g. -prp requires no numeric arg and can come last:
 			fprintf(stderr, "*** ERROR: Unterminated command-line option or malformed argument.\n");
 			print_help();
@@ -3996,7 +3996,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		if(STREQ(stFlag, "-s"))
 		{
 			selfTest = TRUE;
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			for(;;) {
 				if(STREQ(stFlag, "a") || STREQ(stFlag, "all")) {	/* all, which really means all the non-Huge-and-larger sets */
 					start = 0; finish = numTeensy + numTiny + numSmall + numMedium + numLarge;
@@ -4041,11 +4041,11 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		else if(STREQ(stFlag, "-maxalloc"))	// maxalloc arg is max %-of-available-mem to use
 		{
-			ASSERT(HERE, nbufSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!");
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			ASSERT(nbufSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!");
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			darg = strtod(stFlag,&cptr);
 			// Must be > 0:
-			ASSERT(HERE, (darg > 0), "maxalloc (%%-of-available-mem to use) argument must be > 0 ... halting.");
+			ASSERT((darg > 0), "maxalloc (%%-of-available-mem to use) argument must be > 0 ... halting.");
 			// Max-%-of-RAM-to-use currently stored in MAX_RAM_USE ... later will multiply by (available system RAM in MB):
 			MAX_RAM_USE = darg;
 			maxAllocSet = TRUE;
@@ -4054,11 +4054,11 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		else if(STREQ(stFlag, "-pm1_s2_nbuf"))	// pm1_s2_nbuf arg is max %-of-available-mem to use
 		{
-			ASSERT(HERE, maxAllocSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!");
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			ASSERT(maxAllocSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!");
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			darg = strtod(stFlag,&cptr);
 			// Must be > 0:
-			ASSERT(HERE, (darg > 0), "pm1_s2_nbuf argument must be integer ... halting.");
+			ASSERT((darg > 0), "pm1_s2_nbuf argument must be integer ... halting.");
 			// Max-%-of-RAM-to-use currently stored in MAX_RAM_USE ... later will convert to floating-fraction and multiply by (available system RAM in MB):
 			PM1_S2_NBUF = darg;
 			nbufSet = TRUE;
@@ -4067,16 +4067,16 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		else if(STREQ(stFlag, "-iters"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			i64arg = atol(stFlag);
 			// Must be < 2^32:
-			ASSERT(HERE, !(i64arg>>32), "#iters argument must be < 2^32 ... halting.");
+			ASSERT(!(i64arg>>32), "#iters argument must be < 2^32 ... halting.");
 			iters = (uint32)i64arg;
 		}
 
 		else if(STREQ(stFlag, "-fft") || STREQ(stFlag, "-fftlen"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			// v20: default is still integer-FFT-length in Kdoubles, but add support for [float]M,
 			// where floating-point arg must be exactly representable, such that [float]*2^10 is integer:
 			i64arg = -1ull;
@@ -4087,22 +4087,22 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 				else if(STREQ(cptr,"K"))
 					i64arg = darg;
 				else {
-					ASSERT(HERE, 0, "The only non-numeric suffixes allowed for the argument to -fft are K and M");
+					ASSERT(0, "The only non-numeric suffixes allowed for the argument to -fft are K and M");
 				}
 			} else
 				i64arg = darg;
 			// Must be in range [MIN_FFT_LENGTH_IN_K,MAX_FFT_LENGTH_IN_K], def'd in Mdata.h:
 			if(i64arg < MIN_FFT_LENGTH_IN_K || i64arg > MAX_FFT_LENGTH_IN_K) {
-				sprintf(cbuf  , "ERROR: FFT-length argument = %llu, must be in range [%u,%u]K\n",i64arg,MIN_FFT_LENGTH_IN_K,MAX_FFT_LENGTH_IN_K);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				sprintf(cbuf  , "ERROR: FFT-length argument = %" PRIu64 ", must be in range [%u,%u]K\n",i64arg,MIN_FFT_LENGTH_IN_K,MAX_FFT_LENGTH_IN_K);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			fftlen = (uint32)i64arg;	// Note this is the REAL-vector FFT length
 			if((i = get_fft_radices(fftlen, 0, 0x0, 0x0, 0)) != 0) {
 				sprintf(cbuf  , "ERROR: FFT length %d K not available.\n",fftlen);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			// If user has supplied a set of complex-FFT radices, their product must equal half the real-FFT length:
-			if(rad_prod) { ASSERT(HERE, (rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!"); }
+			if(rad_prod) { ASSERT((rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!"); }
 		}
 
 		/* v19.1: Enhance the -radset flag to take either an index into the big table in get_fft_radices(),
@@ -4111,7 +4111,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		set is supported and if so, set radset to the corresponding table-index numeric value: */
 		else if(STREQ(stFlag, "-radset"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 
 			// Check if it's a comma-separated actual set of complex-FFT radices:
 			char_addr = stFlag;
@@ -4119,7 +4119,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 			if(!cptr) {	// It's a radix-set index
 				i64arg = atol(stFlag);
 				// Must be < 2^32:
-				ASSERT(HERE, i64arg < 20, "radset-index argument must be < 2^32 ... halting.");
+				ASSERT(i64arg < 20, "radset-index argument must be < 20 ... halting.");	// message now states the bound actually enforced
 				radset = (uint32)i64arg;
 			} else {	// It's a set of complex-FFT radices
 				numrad = 0;
@@ -4127,23 +4127,23 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 					// Copy substring into cbuf and null-terminate:
 					strncpy(cbuf,char_addr,(cptr-char_addr));	cbuf[cptr-char_addr] = '\0';
 					// Convert current radix to long and sanity-check:
-					i64arg = atol(cbuf);	ASSERT(HERE, !(i64arg>>12), "user-supplied radices must be < 2^12 ... halting.");
+					i64arg = atol(cbuf);	ASSERT(!(i64arg>>12), "user-supplied radices must be < 2^12 ... halting.");
 					rvec[numrad++] = (uint32)i64arg;
 					char_addr = cptr+1;
 				}
 				// A properly formatted radix-set arg will end with ',[numeric]', with the numeric in char_addr:
-				i64arg = atol(char_addr);	ASSERT(HERE, !(i64arg>>12), "user-supplied radices must be < 2^12 ... halting.");
+				i64arg = atol(char_addr);	ASSERT(!(i64arg>>12), "user-supplied radices must be < 2^12 ... halting.");
 				rvec[numrad++] = (uint32)i64arg;
 				rvec[numrad] = 0;	// Null-terminate the vector just for aesthetics
 				// Compute the radix product and make sure it's < 2^30, constraint due to the (fftlen < 2^31) one:
 				rad_prod = 1; i64arg = 1ull;
 				for(i = 0; i < numrad; i++) {
-					i64arg *= rvec[i];	ASSERT(HERE, !(i64arg>>30), "Product of complex-FFT radices supplied via -radset argument must be < 2^32 ... halting.");
+					i64arg *= rvec[i];	ASSERT(!(i64arg>>30), "Product of complex-FFT radices supplied via -radset argument must be < 2^30 ... halting.");	// message now matches the >>30 check
 				}
 				rad_prod = (uint32)i64arg;
 				// If user has supplied a real-FFT length (in Kdoubles) via -fftlen, product of the complex-FFT radices must equal half that value:
 				if(fftlen) {
-					ASSERT(HERE, (rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!");
+					ASSERT((rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!");
 				} else {
 					fftlen = rad_prod>>9;	// If user supplies fftlen via cmd-line arg after -radset, that's OK,
 								// we'll overwrite fftlen with user-supplied value and repeat the above check then
@@ -4163,55 +4163,55 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 					}
 				}
 				// The init-value of radset -1 getting overwritten with something >= 0 means success:
-				ASSERT(HERE, radset >= 0, "User-supplied set of complex-FFT radices not supported.");
+				ASSERT(radset >= 0, "User-supplied set of complex-FFT radices not supported.");
 			}
 		}
 
 		else if(STREQ(stFlag, "-shift"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			i64arg = atol(stFlag);
 			// Must be < 2^32, though store in a uint64 for later bignum-upgrades:
-			ASSERT(HERE, !(i64arg>>32), "shift argument must be < 2^32 ... halting.");
+			ASSERT(!(i64arg>>32), "shift argument must be < 2^32 ... halting.");
 			RES_SHIFT = i64arg;
 		}
 
 		// v20: Add p-1 support:
 		else if(STREQ(stFlag, "-b1"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			i64arg = atol(stFlag);
 			// Must be < 2^32:
-			ASSERT(HERE, !(i64arg>>32), "P-1 Stage 1 bound must be < 2^32 ... halting.");
+			ASSERT(!(i64arg>>32), "P-1 Stage 1 bound must be < 2^32 ... halting.");
 			B1 = (uint32)i64arg;
-			ASSERT(HERE, testType != TEST_TYPE_PRP, "b1-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable.");
+			ASSERT(testType != TEST_TYPE_PRP, "b1-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable.");
 			testType = TEST_TYPE_PM1;
 		}
 		else if(STREQ(stFlag, "-b2")) {
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			// Allow Stage 2 bounds to be > 2^32:
 			B2 = atol(stFlag);
-			ASSERT(HERE, testType != TEST_TYPE_PRP, "b2-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable.");
+			ASSERT(testType != TEST_TYPE_PRP, "b2-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable.");
 			testType = TEST_TYPE_PM1;
 		}
 		else if(STREQ(stFlag, "-b2_start")) {
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			// Allow Stage 2 bounds to be > 2^32:
 			B2_start = atol(stFlag);
-			ASSERT(HERE, testType != TEST_TYPE_PRP, "b2_start-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable.");
+			ASSERT(testType != TEST_TYPE_PRP, "b2_start-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable.");
 			testType = TEST_TYPE_PM1;
 		}
 
 		else if(STREQ(stFlag, "-nthread"))
 		{
 		#ifndef MULTITHREAD
-			ASSERT(HERE,0,"Multithreading must be enabled in build to permit -nthread argument!");
+			ASSERT(0,"Multithreading must be enabled in build to permit -nthread argument!");
 		#else
-			ASSERT(HERE,cpu == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!");
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			ASSERT(cpu == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!");
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			i64arg = atol(stFlag);
 			// Must be < 2^32:
-			ASSERT(HERE, !(i64arg>>32), "nthread argument must be < 2^32 ... halting.");
+			ASSERT(!(i64arg>>32), "nthread argument must be < 2^32 ... halting.");
 			NTHREADS = (uint32)i64arg;
 			nthread = TRUE;
 			// Use the same affinity-setting code here as for the -cpu option, but simply for cores [0:NTHREADS-1]:
@@ -4223,10 +4223,10 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		else if(STREQ(stFlag, "-cpu"))
 		{
 		#ifndef MULTITHREAD
-			ASSERT(HERE,0,"Multithreading must be enabled in build to permit -cpu argument!");
+			ASSERT(0,"Multithreading must be enabled in build to permit -cpu argument!");
 		#else
-			ASSERT(HERE,nthread == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!");
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			ASSERT(nthread == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!");
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			parseAffinityString(stFlag);
 			cpu = TRUE;
 		#endif
@@ -4236,10 +4236,10 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		else if(STREQ(stFlag, "-core"))
 		{
 		#ifndef MULTITHREAD
-			ASSERT(HERE,0,"Multithreading must be enabled in build to permit -core argument!");
+			ASSERT(0,"Multithreading must be enabled in build to permit -core argument!");
 		#else
-			ASSERT(HERE,cpu == FALSE && nthread == FALSE,"Only one of -nthread, -cpu and -core flags permitted!");
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			ASSERT(cpu == FALSE && nthread == FALSE,"Only one of -nthread, -cpu and -core flags permitted!");
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			NTHREADS = parseAffinityTriplet(stFlag,TRUE);	// 2nd-arg = TRUE: Use hwloc-generated topology, via '-core lo:hi[:threads_per_core]'
 			if(NTHREADS > MAX_THREADS) {
 				fprintf(stderr,"ERROR: NTHREADS [ = %d] must not exceed those of available logical cores = 0-%d!\n",NTHREADS,MAX_THREADS-1);
@@ -4256,7 +4256,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		else if(STREQ(stFlag, "-m") || STREQ(stFlag, "-mersenne"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			expo = atol(stFlag);
 			userSetExponent = 1;
 			// Use 0-pad slot in MvecPtr[] to store user-set-exponent data - that can point to either MersVec
@@ -4270,19 +4270,19 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		else if(STREQ(stFlag, "-prp"))	// This flag optionally takes a numeric base arg, and trips us into PRP-test mode
 		{
 			if(nargs < argc) {
-				strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+				strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 				if(isdigit(stFlag[0])) {
 					PRP_BASE = atol(stFlag);
 					if(PRP_BASE+1 == 0) {
-						snprintf_nowarn(cbuf,STR_MAX_LEN, "*** ERROR: Numeric arg to -prp flag, '%s', overflows uint32 field.\n", stFlag);
-						ASSERT(HERE,0,cbuf);
+						snprintf(cbuf,STR_MAX_LEN*2, "*** ERROR: Numeric arg to -prp flag, '%s', overflows uint32 field.\n", stFlag);
+						ASSERT(0,cbuf);
 					}
 				}
 				else
 					--nargs;
 			}
 			// Use 0-pad slot in MvecPRP[] to store user-set-exponent data:
-			ASSERT(HERE,MvecPtr == MersVec,"-prp flag invoked, but MvecPtr does not reflect the default MersVec init-value!");
+			ASSERT(MvecPtr == MersVec,"-prp flag invoked, but MvecPtr does not reflect the default MersVec init-value!");
 			MvecPtr = MvecPRP;
 			if(MersVec[numTest].exponent) {
 				MvecPtr[numTest].exponent = MersVec[numTest].exponent;
@@ -4296,17 +4296,17 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		else if(STREQ(stFlag, "-base"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			i64arg = atol(stFlag);
 			PRP_BASE = (uint32)i64arg;
 		}
 
 		else if(STREQ(stFlag, "-f") || STREQ(stFlag, "-fermat"))
 		{
-			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1);
 			i64arg = atol(stFlag);
 			// Must be < 2^32:
-			ASSERT(HERE, !(i64arg>>32), "Fermat-number-index argument must be < 2^32 ... halting.");
+			ASSERT(!(i64arg>>32), "Fermat-number-index argument must be < 2^32 ... halting.");
 			findex = (uint32)i64arg;
 			/* Make sure the Fermat number index is in range: */
 			if(findex < 13 || findex > 63) {
@@ -4328,7 +4328,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 	// Nov 2020: Sanity-check any p-1 bounds:
 	if(testType == TEST_TYPE_PM1) {
-		ASSERT(HERE, (modType == MODULUS_TYPE_MERSENNE || modType == MODULUS_TYPE_FERMAT) && userSetExponent, "P-1 in command-line mode requires a Mersenne or Fermat-number modulus to be specified via '-m [int]' or '-f [int]'.");
+		ASSERT((modType == MODULUS_TYPE_MERSENNE || modType == MODULUS_TYPE_FERMAT) && userSetExponent, "P-1 in command-line mode requires a Mersenne or Fermat-number modulus to be specified via '-m [int]' or '-f [int]'.");
 		pm1_check_bounds();
 	}
 
@@ -4355,7 +4355,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		if(iarg == 0) {
 			sprintf(cbuf  , "*** ERROR: Must specify a valid FFT length on command line before -radset argument!\n");
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		/* Make sure it's a valid radix set index for this FFT length: */
@@ -4367,7 +4367,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 			else
 				sprintf(cbuf  , "ERROR: Unknown error-code value %d from get_fft_radices(), called with radix set index %d, FFT length %d K\n",i,radset, iarg);
 
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 	}
@@ -4379,15 +4379,15 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 	if(modType == MODULUS_TYPE_MERSENNE && !selfTest)
 	{
 		if(userSetExponent) {
-			ASSERT(HERE, start > 0, "userSetExponent = TRUE but self-test starting-index unset!");
+			ASSERT(start > 0, "userSetExponent = TRUE but self-test starting-index unset!");
 			sprintf(cbuf, "ERROR: Production-run-mode [-iters not invoked] does not allow command-line\nsetting of exponent - that must be read from the %s file.\n",WORKFILE);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		} else if(start == -1) {
 			start = numTest; finish = start+1;
 		}
 		if(radset != -1) {
 			sprintf(cbuf, "ERROR: Production-run-mode [-iters not invoked] allows command-line setting of\nFFT length, but not the radix set - that must be read from the mlucas.cfg file.\n");
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 	ERNST_MAIN:
 		if((retVal = ernstMain(modType,testType,0,MvecPtr[start].fftLength,0,0,0,&Res64,&Res35m1,&Res36m1,scrnFlag,&runtime)) != 0)
@@ -4401,7 +4401,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 				k = (uint32)(retVal >> 8);
 				if((i = get_fft_radices(k, 0, 0x0, 0x0, 0)) != 0) {
 					sprintf(cbuf, "ERROR: FFT length %d K not available.\n",k);
-					fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 
 			/**** IF POSSIBLE, USE ONE OF THE STANDARD TEST EXPONENTS HERE, SO CAN CHECK RES64s!!! ****/
@@ -4424,7 +4424,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 			}
 			/* ...Otherwise barf. */
 			else {
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 		} else {
 			fprintf(stderr, "\n  Done ...\n\n");
@@ -4437,8 +4437,8 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		if(MvecPtr[start].exponent == 0)
 		{
 			i = MvecPtr[start].fftLength;
-			ASSERT(HERE, i > 0                  ,"Require i > 0                  ");
-			ASSERT(HERE, i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K");
+			ASSERT(i > 0                  ,"Require i > 0                  ");
+			ASSERT(i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K");
 
 			// If FFT length is not represented in reference-residue array, find nearest prime <= 0.99*given_N_get_maxP(FFT length):
 			for(j = 0; j < numTest; j++) {
@@ -4454,8 +4454,8 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 					}
 				}
 				if(expo < lo || lo >= hi) {
-					fprintf(stderr, "ERROR: unable to find a prime in the interval %llu <= x <= %llu.\n", lo, hi);
-					ASSERT(HERE, 0,"0");
+					fprintf(stderr, "ERROR: unable to find a prime in the interval %" PRIu64 " <= x <= %" PRIu64 ".\n", lo, hi);
+					ASSERT(0,"0");
 				}
 			} else {	/* Use the corresponding entry of MvecPtr: */
 				start = j; finish = start+1;
@@ -4469,8 +4469,8 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 	{
 		if(FermVec[start].Fidx == 0) {
 			i = FermVec[start].fftLength;
-			ASSERT(HERE, i > 0                  ,"Require i > 0                  ");
-			ASSERT(HERE, i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K");
+			ASSERT(i > 0                  ,"Require i > 0                  ");
+			ASSERT(i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K");
 
 			if(i > FermVec[numFerm-1].fftLength)	/* Computing a new-largest entry? */
 				FermVec[numFerm].Fidx = (i << 4);
@@ -4483,7 +4483,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 				}
 				if(lo >= numFerm) {
 					fprintf(stderr, "ERROR: unable to find FFT length %d K in the Reference Residue table.\n", i);
-					ASSERT(HERE, 0,"0");
+					ASSERT(0,"0");
 				}
 			}
 		}
@@ -4491,7 +4491,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 		else if(findex && (FermVec[numFerm].fftLength == 0))
 			FermVec[numFerm].fftLength = get_default_fft_length((uint64)1 << findex);
 	} else{
-		ASSERT(HERE, 0,"modType not recognized!");
+		ASSERT(0,"modType not recognized!");
 	}
 
 TIMING_TEST_LOOP:
@@ -4621,7 +4621,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 		if(iters == 100 || iters == 1000 || iters == 10000) {
 			mvec_res_t_idx = NINT( log((double)iters)/log(10.) ) - 2;	/* log10(iters) - 2, use slower NINT rather than DNINT here since latter needs correct rounding mode */
-			ASSERT(HERE, mvec_res_t_idx < 3,"main: mvec_res_t_idx out of range!");
+			ASSERT(mvec_res_t_idx < 3,"main: mvec_res_t_idx out of range!");
 			// Use empty-data-slot at top of MersVec[] or MvecPRP[], respectively, for primality & prp single-case tests:
 			if( (modType == MODULUS_TYPE_MERSENNE && MvecPtr[xNum].res_t[mvec_res_t_idx].sh0 == 0)
 			 || (modType == MODULUS_TYPE_FERMAT   && FermVec[xNum].res_t[mvec_res_t_idx].sh0 == 0) )
@@ -4672,7 +4672,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 				retVal = ernstMain(modType,testType,(uint64)MvecPtr[xNum].exponent,iarg,radix_set,maxFFT,iters,&Res64,&Res35m1,&Res36m1,scrnFlag,&runtime);
 			}
 			else
-				ASSERT(HERE, 0,"Unsupported modulus and/or test type!");
+				ASSERT(0,"Unsupported modulus and/or test type!");
 
 			// (retVal != 0) relates to dangerously high ROEs, use maxErr to decide whether to accept radix set.
 			/*** (to-do: factor in #occurrences) ***/
@@ -4774,7 +4774,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 			fp = mlucas_fopen(CONFIGFILE,FILE_ACCESS_MODE);
 			if(!fp) {
 				sprintf(cbuf  , "INFO: Unable to open %s file in %s mode ... \n", CONFIGFILE, FILE_ACCESS_MODE);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 
 			/* Put code version on line 1.
@@ -4799,7 +4799,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 			*/
 			if(get_fft_radices(iarg, radix_best, &NRADICES, RADIX_VEC, 10) != 0) {
 				sprintf(cbuf  , "ERROR: alleged best-radix-set index %u is unsupported.\n",radix_best);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			/* Zero-pad the radices-printing to the full length of the RADIX_VEC array
 			so each line has same length (needed to allow update mode):
@@ -4808,7 +4808,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests
 
 			/* If it's a new self-test residue being computed, add the SH residues to the .cfg file line */
 			if(new_data)
-				fprintf(fp, "\tp = %s: %d-iter Res mod 2^64, 2^35-1, 2^36-1 = %016llX, %11.0f, %11.0f",ESTRING,iters,new_res.sh0,(double)new_res.sh1,(double)new_res.sh2);
+				fprintf(fp, "\tp = %s: %d-iter Res mod 2^64, 2^35-1, 2^36-1 = %016" PRIX64 ", %11.0f, %11.0f",ESTRING,iters,new_res.sh0,(double)new_res.sh1,(double)new_res.sh2);
 
 			fprintf(fp,"\n");
 			fclose(fp); fp = 0x0;
@@ -4842,10 +4842,10 @@ uint64	parse_cmd_args_get_shift_value(void)
 	int i, nargs = 1;
 	while(global_argv[nargs])
 	{
-		strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN);
+		strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN-1);
 		if(STREQ(stFlag, "-shift"))
 		{
-			strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN);
+			strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN-1);
 			/* Convert the shift argument to a uint64: */
 			i64arg = 0;
 			for(i = 0; i < STR_MAX_LEN && stFlag[i] != '\0'; i++) {
@@ -4854,12 +4854,12 @@ uint64	parse_cmd_args_get_shift_value(void)
 					/* Check for overflow: */
 					if(i64arg % (uint64)10 != (uint64)(stFlag[i]-CHAROFFSET))
 					{
-						snprintf_nowarn(cbuf,STR_MAX_LEN, "*** ERROR: -shift argument %s overflows uint64 field.\n", stFlag);
-						fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+						snprintf(cbuf,STR_MAX_LEN*2, "*** ERROR: -shift argument %s overflows uint64 field.\n", stFlag);
+						fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 					}
 				} else {
-					snprintf_nowarn(cbuf,STR_MAX_LEN, "*** ERROR: Non-numeric character encountered in -shift argument %s.\n", stFlag);
-					fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					snprintf(cbuf,STR_MAX_LEN*2, "*** ERROR: Non-numeric character encountered in -shift argument %s.\n", stFlag);
+					fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 			}
 		}
@@ -4899,7 +4899,7 @@ int	cfgNeedsUpdating(char*in_line)
 
 const char*returnMlucasErrCode(uint32 ierr)
 {
-	ASSERT(HERE, ierr < ERR_MAX, "Error code out of range!");
+	ASSERT(ierr > 0 && ierr < ERR_MAX, "Error code out of range!");	// err_code[] is indexed by ierr-1, so ierr = 0 must be rejected too: the unsigned (ierr-1) would wrap and index far outside the table
 	return err_code[ierr-1];
 }
 
@@ -4918,7 +4918,7 @@ void	printMlucasErrCode(uint32 ierr)
 	/* High bytes should only be nonzero if low byte == ERR_RUN_SELFTEST_FORLENGTH: */
 	if((ierr>>8) != 0)
 	{
-		ASSERT(HERE, i==ERR_RUN_SELFTEST_FORLENGTH, "High bytes should only be nonzero if low byte == ERR_RUN_SELFTEST_FORLENGTH!");
+		ASSERT(i==ERR_RUN_SELFTEST_FORLENGTH, "High bytes should only be nonzero if low byte == ERR_RUN_SELFTEST_FORLENGTH!");
 	}
 }
 
@@ -5075,7 +5075,7 @@ int read_ppm1_residue(const uint32 nbytes, FILE*fp, uint8 arr_tmp[], uint64*Res6
 	for(i = nbytes; i < nbytes+j; i++) arr_tmp[i] = 0;
 	itmp64 = ((uint64*)arr_tmp)[0];
 	if(*Res64 != itmp64) {
-		sprintf(cbuf, "%s: On restart: Res64 checksum error! Got %llX, expected %llX\n"  ,func,itmp64,*Res64); return 0;
+		sprintf(cbuf, "%s: On restart: Res64 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n"  ,func,itmp64,*Res64); return 0;
 	}
 	// For big-endian CPUs, casting byte-array to uint64* gives byte-reversed limbs, so use a direct bitwise mod:
   #ifdef USE_BIG_ENDIAN
@@ -5115,17 +5115,17 @@ int read_ppm1_residue(const uint32 nbytes, FILE*fp, uint8 arr_tmp[], uint64*Res6
 		MOD_ADD64(bmod35,29,35,bmod35); MOD_ADD64(bmod36,28,36,bmod36);	// bmod35|36 += 29|28 (mod 35|36)
 	}
 	rmod35 = (rmod35 & two35m1) + (rmod35 >> 35); rmod36 = (rmod36 & two36m1) + (rmod36 >> 36);	// And do a final pair of folds to get mods
-	if(*Res35m1 != rmod35)	{ sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %llX, expected %llX\n",func,rmod35,*Res35m1); return 0; }
-	if(*Res36m1 != rmod36)	{ sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %llX, expected %llX\n",func,rmod36,*Res36m1); return 0; }
+	if(*Res35m1 != rmod35)	{ sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,rmod35,*Res35m1); return 0; }
+	if(*Res36m1 != rmod36)	{ sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,rmod36,*Res36m1); return 0; }
   #else
 	i = (nbytes+7)>>3;	// # of 64-bit limbs
 	itmp64 = mi64_div_by_scalar64((uint64*)arr_tmp,two35m1,i,0x0);
 	if(*Res35m1 != itmp64) {
-		sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %llX, expected %llX\n",func,itmp64,*Res35m1); return 0;
+		sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,itmp64,*Res35m1); return 0;
 	}
 	itmp64 = mi64_div_by_scalar64((uint64*)arr_tmp,two36m1,i,0x0);
 	if(*Res36m1 != itmp64) {
-		sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %llX, expected %llX\n",func,itmp64,*Res36m1); return 0;
+		sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,itmp64,*Res36m1); return 0;
 	}
   #endif
 	return 1;
@@ -5144,9 +5144,9 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin
 	uint128 ui128,vi128; uint192 ui192,vi192; uint256 ui256,vi256;	// Fixed-length 2/3/4-word ints for stashing results of multiword modexp.
 	*Res64 = 0ull;	// 0 value on return indicates failure of some kind
 	mi64_clear(pow,4); mi64_clear(rem,4);
-	ASSERT(HERE, arr1 != 0x0, "Null arr1 pointer!");
+	ASSERT(arr1 != 0x0, "Null arr1 pointer!");
 	if(!file_valid(fp)) {
-		sprintf(cbuf, "%s: File pointer invalid for read!\n",func);	ASSERT(HERE, 0, cbuf);
+		sprintf(cbuf, "%s: File pointer invalid for read!\n",func);	ASSERT(0, cbuf);
 	}
 	fprintf(stderr, " INFO: restart file %s found...reading...\n",fname);
 	/* t: */
@@ -5170,9 +5170,9 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin
 	if(TEST_TYPE == TEST_TYPE_PM1) {
 		if(strstr(fname, ".s2")) {
 			if(nsquares > 0xFFFFFFFFull)
-				ASSERT(HERE, B2_start <= nsquares, "P-1 stage 2 restart requires (B2_start in worktodo assignment) <= (savefile nsquares field)!");
+				ASSERT(B2_start <= nsquares, "P-1 stage 2 restart requires (B2_start in worktodo assignment) <= (savefile nsquares field)!");
 		} else {	// It's a stage 1 restart:
-			ASSERT(HERE, nsquares <= 0xFFFFFFFFull && nsquares < 1.5*(double)B1, "P-1 stage 1 restart: savefile nsquares value out of bounds!");
+			ASSERT(nsquares <= 0xFFFFFFFFull && nsquares < 1.5*(double)B1, "P-1 stage 1 restart: savefile nsquares value out of bounds!");
 		}
 		// If S2 restart and (nsquares > B2_start), read the ensuing S2 interim residue; if (nsquares == B2_start)
 		// it means S2 started but was aborted for some reason before writing an interim S2 residue. That will set
@@ -5180,10 +5180,10 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin
 		// the S2 code interprets as "start stage 2 from B2_start."
 	} else {	// For primality-tests, make sure nsquares < 2^32 and copy to ilo:
 		if(nsquares > p) {	// v21: change from >= p to > p, since Mersenne-PRP restart-to-check-CF will have nsquares == p:
-			sprintf(cbuf,"%s: nsquares = %llu out of range, should be < p = %llu\n",func, nsquares, p);
+			sprintf(cbuf,"%s: nsquares = %" PRIu64 " out of range, should be < p = %" PRIu64 "\n",func, nsquares, p);
 			return 0;
 		} else if(nsquares > 0xFFFFFFFFull) {
-			sprintf(cbuf,"%s: nsquares = %llu out of range, current limit = 2^32-1.\n",func, nsquares);
+			sprintf(cbuf,"%s: nsquares = %" PRIu64 " out of range, current limit = 2^32-1.\n",func, nsquares);
 			return 0;
 		}
 	}
@@ -5195,7 +5195,7 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin
 		TRANSFORM_TYPE = REAL_WRAPPER;
 	} else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) {
 		sprintf(cbuf, "%s: MODULUS_TYPE_FERMAT but (p mod 8) != 0",func);
-		ASSERT(HERE, (p & 7) == 0,cbuf);
+		ASSERT((p & 7) == 0,cbuf);
 		nbytes = (p>>3) + 1;
 		TRANSFORM_TYPE = RIGHT_ANGLE;
 	}
@@ -5243,10 +5243,10 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin
 	(Also added CRT routine to nt_utils.txt, which takes the 3 remainders mod the known prime factors and confirms the DIV result.)
 	*/
 	if(TEST_TYPE == TEST_TYPE_PRP) {
-		len = (nbytes+7)>>3; j = p&63; itmp64 = avec[len-1];	ASSERT(HERE, (itmp64 >> j) == 0ull, "High limb of residue array1 does not have upper bits cleared!");
+		len = (nbytes+7)>>3; j = p&63; itmp64 = avec[len-1];	ASSERT((itmp64 >> j) == 0ull, "High limb of residue array1 does not have upper bits cleared!");
 		for(i = 0; KNOWN_FACTORS[i] != 0ull; i += 4) {
 			j = mi64_getlen(KNOWN_FACTORS+i,4);	// j = number of nonzero limbs in curr_fac (alloc 4 limbs per in KNOWN_FACTORS[])
-			sprintf(cstr,"Computing %llu-squaring residue R (mod known prime q = %s)\n",nsquares,&cbuf[convert_mi64_base10_char(cbuf, KNOWN_FACTORS+i, j, 0)] ); mlucas_fprint(cstr,1);
+			sprintf(cstr,"Computing %" PRIu64 "-squaring residue R (mod known prime q = %s)\n",nsquares,&cbuf[convert_mi64_base10_char(cbuf, KNOWN_FACTORS+i, j, 0)] ); mlucas_fprint(cstr,1);
 			mi64_div(avec,KNOWN_FACTORS+i, len,j, 0x0,rem);	// R (mod p) returned in rem[]
 			k = mi64_getlen(rem,4);	// j = number of nonzero limbs in remainder
 			sprintf(cstr,"\tA: R == %s (mod q)\n",&cbuf[convert_mi64_base10_char(cbuf, rem, k, 0)] ); mlucas_fprint(cstr,1);
@@ -5268,13 +5268,13 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin
 				ui256 = twopmmodq256(vi256,ui256);	// pow' = 2^nsquares (mod p-1)
 				exp[0] = ui256.d0; exp[1] = ui256.d1; exp[2] = ui256.d2; exp[3] = ui256.d3;
 			} else
-				ASSERT(HERE, 0, "Only known-factors < 2^256 supported!");
+				ASSERT(0, "Only known-factors < 2^256 supported!");
 			// Raise PRP base (usually but not always 3) to the just-computed power; result in 4-limb local-array pow[]:
 			mi64_scalar_modpow_lr(PRP_BASE, exp, KNOWN_FACTORS+i, j, pow);
 			sprintf(cstr,"\tB: R == %s (mod q)\n",&cbuf[convert_mi64_base10_char(cbuf, pow, j, 0)] ); mlucas_fprint(cstr,1);
 			if (mi64_getlen(pow,4) != k || !mi64_cmp_eq(pow,rem,k)) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN,"Full-residue == %u^nsquares (mod q) check fails!", PRP_BASE); mlucas_fprint(cbuf,0);
-				ASSERT(HERE, 0, cbuf);
+				snprintf(cbuf,STR_MAX_LEN,"Full-residue == %u^nsquares (mod q) check fails!", PRP_BASE); mlucas_fprint(cbuf,0);
+				ASSERT(0, cbuf);
 			}
 		}
 	}
@@ -5311,7 +5311,7 @@ Thus if we use a negative-power algo, to recover 2^p (mod q = 2^k.qodd):
 
   // v19: For PRP-tests, also read a second Gerbicz-check residue array [arr2] and associated S-H checksum triplet [i1,i2,i3]:
   if(DO_GCHECK) {	// v21: Change to key off DO_GCHECK, to allow Fermat-mod Pepin-tests to use the Gerbicz check, too
-	ASSERT(HERE, arr2 != 0x0, "Null arr2 pointer!");
+	ASSERT(arr2 != 0x0, "Null arr2 pointer!");
 	PRP_BASE = 0ull;
 	for(j = 0; j < 4; j++) {
 		i = fgetc(fp);	PRP_BASE += i << (8*j);
@@ -5367,8 +5367,8 @@ void write_ppm1_residue(const uint32 nbytes, FILE*fp, const uint8 arr_tmp[], con
 	i = fwrite(arr_tmp, sizeof(char), nbytes, fp);
 	if(i != nbytes) {
 		fclose(fp); fp = 0x0;
-		snprintf_nowarn(cbuf,STR_MAX_LEN,"%s: Error writing residue to restart file.\n",func);
-		mlucas_fprint(cbuf,0);	ASSERT(HERE,0,cbuf);
+		snprintf(cbuf,STR_MAX_LEN*2,"%s: Error writing residue to restart file.\n",func);
+		mlucas_fprint(cbuf,0);	ASSERT(0,cbuf);
 	}
 	/* ...and checksums:	*/
 	/* Res64: */
@@ -5388,10 +5388,10 @@ void write_ppm1_savefiles(const char*fname, uint64 p, int n, FILE*fp, uint64 ihi
 	uint8 arr2[], uint64 i1   , uint64 i2     , uint64 i3     )
 {
 	uint32 i,kblocks,nbytes = 0;
-	ASSERT(HERE,file_valid(fp),"write_ppm1_savefiles: File pointer invalid for write!");
+	ASSERT(file_valid(fp),"write_ppm1_savefiles: File pointer invalid for write!");
 	// Make sure n is a proper (unpadded) FFT-length, i.e. is a multiple of 1K:
 	kblocks = (n >> 10);
-	ASSERT(HERE,n == (kblocks << 10),"Not a proper unpadded FFT length");
+	ASSERT(n == (kblocks << 10),"Not a proper unpadded FFT length");
 
 	/* See the function read_ppm1_savefiles() for the file format here: */
 	/* t: */
@@ -5407,7 +5407,7 @@ void write_ppm1_savefiles(const char*fname, uint64 p, int n, FILE*fp, uint64 ihi
 		nbytes = (p + 7)/8;
 		TRANSFORM_TYPE = REAL_WRAPPER;
 	} else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) {
-		ASSERT(HERE, p % 8 == 0,"write_ppm1_savefiles: p % 8 == 0");
+		ASSERT(p % 8 == 0,"write_ppm1_savefiles: p % 8 == 0");
 		nbytes = (p/8) + 1;	// We don't expect > p bits except in the highly unlikely case of a prime-Fermat Pepin-test result
 		TRANSFORM_TYPE = RIGHT_ANGLE;
 	}
@@ -5460,27 +5460,27 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 	uint64 curr_word, curr_wd64;
 	int pow2_fft;
 
-	ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!");
-	ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
+	ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!");
+	ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
 
-	ASSERT(HERE,TRANSFORM_TYPE,"TRANSFORM_TYPE not set!");
-	ASSERT(HERE,TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!");
+	ASSERT(TRANSFORM_TYPE,"TRANSFORM_TYPE not set!");
+	ASSERT(TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!");
 
 	/* Set the number of residue bytes, which is the same for Mersenne (2^p-1) and Fermat-mod (2^p+1, with p = 2^findex)
 	despite the fact the latter can formally be as large as 2^p, since only ever hit that if it`s the last residue of
 	a Pepin test and the number hqppens to be prime. (We would love for that exception to break some other ASSERTion in the code): */
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 	{
-		ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_bytewise_FP: TRANSFORM_TYPE == REAL_WRAPPER");
+		ASSERT(TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_bytewise_FP: TRANSFORM_TYPE == REAL_WRAPPER");
 	}
 	else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_bytewise_FP: TRANSFORM_TYPE == RIGHT_ANGLE");
+		ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_bytewise_FP: TRANSFORM_TYPE == RIGHT_ANGLE");
 		/* If Fermat number, make sure exponent a power of 2: */
 		findex = trailz64(p);
-		ASSERT(HERE, (p >> findex) == 1,"convert_res_bytewise_FP: (p >> findex) == 1");
+		ASSERT((p >> findex) == 1,"convert_res_bytewise_FP: (p >> findex) == 1");
 
-		ASSERT(HERE, p % 8 == 0,"convert_res_bytewise_FP: p % 8 == 0");
+		ASSERT(p % 8 == 0,"convert_res_bytewise_FP: p % 8 == 0");
 	}
 	nbytes = (p + 7)/8;
 	// Apply the circular shift:
@@ -5490,7 +5490,7 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 	/* Vector length a power of 2? */
 	pow2_fft = (n >> trailz32(n)) == 1;
 
-	bits[0] = p/n;		ASSERT(HERE, bits[0] > 1,"convert_res_bytewise_FP: bits[0] > 1");
+	bits[0] = p/n;		ASSERT(bits[0] > 1,"convert_res_bytewise_FP: bits[0] > 1");
 	base[0] = 1 << bits[0];
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT && pow2_fft == TRUE)
@@ -5542,7 +5542,7 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 			if(rbits < bits[ii])
 			{
 				itmp = curr_wd64;
-				ASSERT(HERE, itmp < (1ull<<rbits),"convert_res_bytewise_FP: itmp >= 2^rbits!");
+				ASSERT(itmp < (1ull<<rbits),"convert_res_bytewise_FP: itmp >= 2^rbits!");
 
 				/* Now grab the next 64 bits of the bytewise residue... */
 				curr_wd64 = 0;
@@ -5619,7 +5619,7 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 			if(rbits < bits[ii])
 			{
 				itmp = curr_wd64;
-				ASSERT(HERE, itmp < (1<<rbits),"convert_res_bytewise_FP: itmp >= 2^rbits!");
+				ASSERT(itmp < (1<<rbits),"convert_res_bytewise_FP: itmp >= 2^rbits!");
 
 				/* Now grab the next 64 bits of the bytewise residue... */
 				curr_wd64 = 0;
@@ -5662,9 +5662,9 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 	  }
 	}
 
-	ASSERT(HERE, curr_char == nbytes, "convert_res_bytewise_FP: curr_char == (p+7)/8");
-	ASSERT(HERE, nbits == p    ,"convert_res_bytewise_FP: nbits == p    ");
-	ASSERT(HERE, curr_wd64 == 0,"convert_res_bytewise_FP: curr_word == 0");
+	ASSERT(curr_char == nbytes, "convert_res_bytewise_FP: curr_char == (p+7)/8");
+	ASSERT(nbits == p    ,"convert_res_bytewise_FP: nbits == p    ");
+	ASSERT(curr_wd64 == 0,"convert_res_bytewise_FP: curr_word == 0");
 
 	/*
 	Fold any carryout from the conversion to balanced-representation form
@@ -5674,8 +5674,8 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 	/* Should have carryout of +1 Iff MS word < 0; otherwise expect 0 carry: */
 	if(cy && (a[j1] >= 0 || cy != +1))
 	{
-		sprintf(cbuf, "convert_res_bytewise_FP: Illegal combination of nonzero carry = %lld, most sig. word = %20.4f\n", cy, a[j]);
-		ASSERT(HERE, 0, cbuf);
+		sprintf(cbuf, "convert_res_bytewise_FP: Illegal combination of nonzero carry = %" PRId64 ", most sig. word = %20.4f\n", cy, a[j]);
+		ASSERT(0, cbuf);
 	}
 
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -5683,7 +5683,7 @@ int 	convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const
 	else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 		a[0] -= cy;
 	else
-		ASSERT(HERE, 0,"Illegal modulus type!");
+		ASSERT(0,"Illegal modulus type!");
 	return TRUE;
 }
 
@@ -5712,22 +5712,22 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 	const uint64 two35m1 = (uint64)0x00000007FFFFFFFFull, two36m1 = (uint64)0x0000000FFFFFFFFFull;	/* 2^35,36-1 */
 	uint64*u64_ptr = (uint64*)ui64_arr_out;
 
-	ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!");
-	ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
-	ASSERT(HERE,TRANSFORM_TYPE,"TRANSFORM_TYPE not set!");
-	ASSERT(HERE,TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!");
+	ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!");
+	ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!");
+	ASSERT(TRANSFORM_TYPE,"TRANSFORM_TYPE not set!");
+	ASSERT(TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!");
 
 	/* If Fermat number, make sure exponent a power of 2: */
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_FP_bytewise: TRANSFORM_TYPE == RIGHT_ANGLE");
+		ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_FP_bytewise: TRANSFORM_TYPE == RIGHT_ANGLE");
 		findex = trailz64(p);
-		ASSERT(HERE, (p >> findex) == 1,"convert_res_FP_bytewise: (p >> findex) == 1");
+		ASSERT((p >> findex) == 1,"convert_res_FP_bytewise: (p >> findex) == 1");
 	}
 	else if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
-		ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_FP_bytewise: TRANSFORM_TYPE == REAL_WRAPPER");
+		ASSERT(TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_FP_bytewise: TRANSFORM_TYPE == REAL_WRAPPER");
 	else
-		ASSERT(HERE, 0,"Illegal modulus type!");
+		ASSERT(0,"Illegal modulus type!");
 
 	/* Vector length a power of 2? */
 	pow2_fft = (n >> trailz32(n)) == 1;
@@ -5839,7 +5839,7 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 			atmp = a[j1];
 			if(atmp != NINT(atmp)) {
 				sprintf(cbuf,"%s: Input float-residue elements must have 0 fractional part! A[%u (of %u)] = %20.10f",func,j,n,atmp);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			itmp = (int64)(atmp+ cy);	/* current digit in int64 form, subtracting any borrow from the previous digit.	*/
 			if(itmp < 0) {			/* If current digit < 0, add the current base and set carry into next-higher digit = -1	*/
@@ -5848,11 +5848,11 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 			} else {
 				cy = 0;
 			}
-			ASSERT(HERE, itmp >= 0,"convert_res_FP_bytewise: itmp >= 0");
+			ASSERT(itmp >= 0,"convert_res_FP_bytewise: itmp >= 0");
 
 		/* Update 8-byte residue buffer last, since this one modifies itmp: */
-			ASSERT(HERE, rbits < 8,"convert_res_FP_bytewise: rbits < 8");
-			ASSERT(HERE, curr_wd64 < (1<<rbits),"convert_res_FP_bytewise: curr_wd64 >= 2^rbits!");
+			ASSERT(rbits < 8,"convert_res_FP_bytewise: rbits < 8");
+			ASSERT(curr_wd64 < (1<<rbits),"convert_res_FP_bytewise: curr_wd64 >= 2^rbits!");
 
 			itmp = (itmp << rbits) + curr_wd64;
 			curr_bits = bits[ii] + rbits;
@@ -5899,7 +5899,7 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 			atmp = a[j1];
 			if(atmp != NINT(atmp)) {
 				sprintf(cbuf,"%s: Input float-residue elements must have 0 fractional part! A[%u (of %u) = %20.10f] = ",func,j,n,atmp);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			itmp = (int64)(atmp+ cy);	/* current digit in int64 form, subtracting any borrow from the previous digit.	*/
 			if(itmp < 0) {			/* If current digit < 0, add the current base and set carry into next-higher digit = -1	*/
@@ -5908,11 +5908,11 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 			} else {
 				cy = 0;
 			}
-			ASSERT(HERE, itmp >= 0,"convert_res_FP_bytewise: itmp >= 0");
+			ASSERT(itmp >= 0,"convert_res_FP_bytewise: itmp >= 0");
 
 		/* Update 8-byte residue buffer last, since this one modifies itmp: */
-			ASSERT(HERE, rbits < 8,"convert_res_FP_bytewise: rbits < 8");
-			ASSERT(HERE, curr_wd64 < (1<<rbits),"convert_res_FP_bytewise: curr_wd64 >= 2^rbits!");
+			ASSERT(rbits < 8,"convert_res_FP_bytewise: rbits < 8");
+			ASSERT(curr_wd64 < (1<<rbits),"convert_res_FP_bytewise: curr_wd64 >= 2^rbits!");
 
 			itmp = (itmp << rbits) + curr_wd64;
 			curr_bits = bits[ii] + rbits;
@@ -5932,18 +5932,18 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 	if(cy && (!msw_lt0 || cy != -1))
 	{
 		sprintf(cbuf, "convert_res_FP_bytewise: Illegal combination of nonzero carry = %d, msw_lt0 = %d\n", cy, msw_lt0);
-		ASSERT(HERE, 0, cbuf);
+		ASSERT(0, cbuf);
 	}
 	/* Residue should contain ceiling(p/8) bytes: */
-	ASSERT(HERE, rbits < 8, "rbits >= 8");
+	ASSERT(rbits < 8, "rbits >= 8");
 	if(rbits) {
-		ASSERT(HERE, curr_wd64 < (1<<rbits),"convert_res_FP_bytewise: curr_wd64 >= 2^rbits!");
+		ASSERT(curr_wd64 < (1<<rbits),"convert_res_FP_bytewise: curr_wd64 >= 2^rbits!");
 		ui64_arr_out[curr_char++] = curr_wd64 & 255;
 		curr_wd64 >>= 8;
 	}
-	ASSERT(HERE, curr_char == (p+7)/8,"convert_res_FP_bytewise: curr_char == (p+7)/8");
-	ASSERT(HERE, nbits == p          ,"convert_res_FP_bytewise: nbits == p          ");
-	ASSERT(HERE, curr_wd64 == 0      ,"convert_res_FP_bytewise: curr_wd64 == 0      ");
+	ASSERT(curr_char == (p+7)/8,"convert_res_FP_bytewise: curr_char == (p+7)/8");
+	ASSERT(nbits == p          ,"convert_res_FP_bytewise: nbits == p          ");
+	ASSERT(curr_wd64 == 0      ,"convert_res_FP_bytewise: curr_wd64 == 0      ");
 
 	// Remove the circular shift ... have no mi64_shrc function, so use that b-bit rightward cshift equivalent to (p-b)-bit left-cshift.
 	// (But must guard against RES_SHIFT = 0, since in that case the left-shift count == p and mi64_shlc requires shift count strictly < p):
@@ -5953,7 +5953,7 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 	***/
 	j = (p+63)>>6;	// # of 64-bit limbs
 	if(RES_SHIFT) {
-	//	fprintf(stderr,"convert_res_FP_bytewise: removing shift = %llu\n",RES_SHIFT);
+	//	fprintf(stderr,"convert_res_FP_bytewise: removing shift = %" PRIu64 "\n",RES_SHIFT);
 		uint32 sign_flip = (MODULUS_TYPE == MODULUS_TYPE_FERMAT);
 		mi64_shlc(u64_ptr, u64_ptr, p, p-RES_SHIFT,j,sign_flip);
 		// If current residue R needed a sign-flip - again, this can only happen in the Fermat-mod case -
@@ -5968,7 +5968,7 @@ void	convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons
 	if(Res64  ) *Res64 = ((uint64*)ui64_arr_out)[0];
 	if(Res35m1) *Res35m1 = mi64_div_by_scalar64((uint64*)ui64_arr_out,two35m1,j,0x0);
 	if(Res36m1) *Res36m1 = mi64_div_by_scalar64((uint64*)ui64_arr_out,two36m1,j,0x0);
-//	fprintf(stderr,"Res35m1,Res36m1: %llu,%llu\n",*Res35m1,*Res36m1);
+//	fprintf(stderr,"Res35m1,Res36m1: %" PRIu64 ",%" PRIu64 "\n",*Res35m1,*Res36m1);
 }
 
 /*********************/
@@ -6029,7 +6029,7 @@ uint32 get_default_factoring_depth(uint64 p)
 int	is_hex_string(char*s, int len)
 {
 	int i;
-	ASSERT(HERE, s != 0x0, "Null ptr to is_hex_string()");
+	ASSERT(s != 0x0, "Null ptr to is_hex_string()");
 	for(i = 0; i < len; ++i)
 	{
 		if( !isxdigit(s[i]) )
@@ -6101,7 +6101,7 @@ char*check_kbnc(char*in_str, uint64*p) {
 		if((char_addr = strstr(cptr, ",")) == 0x0) {
 			fprintf(stderr,"%s: Expected ',' not found in assignment-specifying line!\n",func); break;
 		}
-		*p = strtoull(char_addr+1, &cptr, 10);	ASSERT(HERE, *p != -1ull, "strtoull() overflow detected.");
+		*p = strtoull(char_addr+1, &cptr, 10);	ASSERT(*p != -1ull, "strtoull() overflow detected.");
 		if(*p > PMAX) {
 			fprintf(stderr,"%s: Exponent n in modulus expression m = k*b^n+c exceeds limit! (Suggest checking for unsigned overflow.)\n",func); break;
 		}
@@ -6161,7 +6161,7 @@ void generate_JSON_report(
 	const char*pm1_status[2] = {"NF","F"};
 	const char*false_or_true[2] = {"false","true"};
 	// Attempt to read 32-hex-char Primenet assignment ID for current assignment (first line of WORKFILE):
-	ASSERT(HERE,(fp = mlucas_fopen(WORKFILE, "r")) != 0x0,"Workfile not found!");
+	ASSERT((fp = mlucas_fopen(WORKFILE, "r")) != 0x0,"Workfile not found!");
 	// v20.1.1: Parse first line whose leading non-WS char is alphabetic:
 	char_addr = 0x0;
 	while(fgets(in_line, STR_MAX_LEN, fp) != 0x0) {
@@ -6169,10 +6169,10 @@ void generate_JSON_report(
 		if(isalpha(*char_addr)) break;
 	}
 	fclose(fp); fp = 0x0;
-	ASSERT(HERE,strlen(char_addr) != 0 && isalpha(*char_addr),"Eligible assignment (leading non-WS char alphabetic) not found in workfile!");
+	ASSERT(strlen(char_addr) != 0 && isalpha(*char_addr),"Eligible assignment (leading non-WS char alphabetic) not found in workfile!");
 	if(!strstr(in_line, ESTRING) && !(MODULUS_TYPE == MODULUS_TYPE_FERMAT && strstr(in_line, BIN_EXP)) ) {
-		snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: Current exponent %s not found in %s file!\n",ESTRING,WORKFILE);
-		ASSERT(HERE,0,cbuf);
+		snprintf(cbuf,STR_MAX_LEN*2, "ERROR: Current exponent %s not found in %s file!\n",ESTRING,WORKFILE);
+		ASSERT(0,cbuf);
 	}
 	// Is there a Primenet-server 32-hexit assignment ID in the assignment line? If so, include it in the JSON output:
 	char_addr = strstr(in_line, "=");
@@ -6186,9 +6186,9 @@ void generate_JSON_report(
 	if(TEST_TYPE == TEST_TYPE_PRIMALITY) {
 		snprintf(ttype,10,"LL");
 		if(*aid) {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer,aid);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer,aid);
 		} else {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer);
 		}
 	} else if(TEST_TYPE == TEST_TYPE_PRP && KNOWN_FACTORS[0]) {	// PRP-CF result
 		// Print list of known factors used for CF test. Unlike the Primenet assignment formtting on the input side,
@@ -6207,42 +6207,42 @@ void generate_JSON_report(
 		strcat( cbuf, "]");
 		snprintf(ttype,10,"PRP-%u",PRP_BASE);
 		if(*aid) {
-			snprintf_nowarn(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid);
 		} else {
-			snprintf_nowarn(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer);
 		}
 	} else if(TEST_TYPE == TEST_TYPE_PRP) {	// Only support type-1 PRP tests, so hardcode that subfield:
 		snprintf(ttype,10,"PRP-%u",PRP_BASE);
 		if(*aid) {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid);
 		} else {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer);
 		}
 	} else if(TEST_TYPE == TEST_TYPE_PM1) {	// For p-1 assume there was an AID in the assignment, even if an all-0s one:
 		snprintf(ttype,10,"PM1");
 		if(!strlen(factor)) {	// No factor was found:
 		  if(*aid) {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer,aid);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer,aid);
 		  } else {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer);
 		  }
 		} else {	// The factor in the eponymous arglist field was found:
 		  if(B2 <= B1) {	// No stage 2 was run
 		   if(*aid) {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer,aid);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer,aid);
 		   } else {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer);
 		   }
 		  } else {	// Include B2 and flag indicating whether the s2 interval was completely covered or not. Factor must be in "" due to possibility of > 64-bit, which overflows a JSON int:
 		   if(*aid) {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer,aid);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer,aid);
 		   } else {
-			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer);
+			snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer);
 		   }
 		  }
 	}
 	} else
-		ASSERT(HERE, 0, "Unsupported test type!");
+		ASSERT(0, "Unsupported test type!");
 }
 
 /*********************/
@@ -6307,7 +6307,7 @@ void dif1_dit1_func_name(
 	case 4032:	*func_dif_pass1 = radix4032_dif_pass1;	*func_dit_pass1 = radix4032_dit_pass1;	break;
 //	case 4096:	*func_dif_pass1 = radix4096_dif_pass1;	*func_dit_pass1 = radix4096_dit_pass1;	break;
 	default:
-		sprintf(cbuf,"ERROR: radix %d not available for [dif,dit] pass1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+		sprintf(cbuf,"ERROR: radix %d not available for [dif,dit] pass1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
   }
 }
 
@@ -6325,14 +6325,14 @@ uint32 extract_known_factors(uint64 p, char*fac_start) {
 	uint64 *fac = 0x0, twop[4], quo[4],rem[4];	// fac = ptr to each mi64-converted factor input string;
 	uint256 p256,q256,res256;
 	char*cptr = fac_start+1;
-	ASSERT(HERE, fac_start[0] == '\"',"Known-factors line of worktodo must consist of a comma-separated list of such enclosed in double-quotes!");
+	ASSERT(fac_start[0] == '\"',"Known-factors line of worktodo must consist of a comma-separated list of such enclosed in double-quotes!");
 	/* If it's a Fermat number, need to check size of 2^ESTRING: */
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) {
 		findex = (uint32)p;
 		if(findex <= MAX_PRIMALITY_TEST_BITS)
 			p = (uint64)1 << findex;
 		else
-			ASSERT(HERE, 0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS");
+			ASSERT(0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS");
 	}
 	// Factors separated by commas (first clause of while()); list terminated with " (2nd clause):
 	while((char_addr = strstr(cptr,",")) != 0x0 || (char_addr = strstr(cptr,"\"")) != 0x0) {
@@ -6340,34 +6340,34 @@ uint32 extract_known_factors(uint64 p, char*fac_start) {
 		strncpy(cbuf,cptr,nchar);	cbuf[nchar] = '\0';	// Extract current-factor-as-string into cbuf
 		// Convert stringified factor f to mi64 form:
 		lenf = 0; fac = convert_base10_char_mi64(cbuf, &lenf);	// This does the mem-alloc for us
-		ASSERT(HERE, lenf > 0, "Error converting known-factor string!");
-		ASSERT(HERE, lenf < 5, "known-factor out of range, must be < 2^256!");
+		ASSERT(lenf > 0, "Error converting known-factor string!");
+		ASSERT(lenf < 5, "known-factor out of range, must be < 2^256!");
 		fbits = (lenf<<6) - mi64_leadz(fac, lenf);
 		// Make sure the alleged factor is of the proper form:
 		// For Mersenne M(p), q = 2.k.p + 1, with p prime; For Fermat F_n = 2^2^n+1, q = k.2^(n+2) + 1
 		// and we store the binary exponent 2^n in p, and 2^(n+2) in twop (yes, a misnomer in this case):
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
-			mi64_set_eq_scalar(twop,p<<1,lenf);	ASSERT(HERE, p < 0x8000000000000000ull, "Mersenne exponent limited to 63 bits!");
+			mi64_set_eq_scalar(twop,p<<1,lenf);	ASSERT(p < 0x8000000000000000ull, "Mersenne exponent limited to 63 bits!");
 		} else {
-			mi64_set_eq_scalar(twop,p<<2,lenf);	ASSERT(HERE, p < 0x4000000000000000ull, "Fermat-number index must be < 62!");
+			mi64_set_eq_scalar(twop,p<<2,lenf);	ASSERT(p < 0x4000000000000000ull, "Fermat-number index must be < 62!");
 		}
 		mi64_div(fac,twop, lenf,lenf, quo,rem);
-		i = mi64_cmp_eq_scalar(rem,1ull,lenf);	ASSERT(HERE, i,"Factor not of required form!");
+		i = mi64_cmp_eq_scalar(rem,1ull,lenf);	ASSERT(i,"Factor not of required form!");
 		// Alloc 4 limbs per factor in KNOWN_FACTORS; if current factor needs just 1 there's no uninited
 		// problem with the high limbs since KNOWN_FACTORS is zeroed at start of each new assignment:
-		ASSERT(HERE, nfac < 10, "Limit of 10 known factors!");
+		ASSERT(nfac < 10, "Limit of 10 known factors!");
 		mi64_set_eq(KNOWN_FACTORS + 4*nfac++,fac,lenf);
 		// Verify that F is a base-3 Fermat-PRP via binary modpow, 3^(q-1) == 1 (mod q):
-		ASSERT(HERE, mi64_pprimeF(fac,3ull,lenf),"Factor-is-base-3-PRP check fails!");
+		ASSERT(mi64_pprimeF(fac,3ull,lenf),"Factor-is-base-3-PRP check fails!");
 		// Verify that it's a factor via binary modpow:
 		p256.d0 = p; p256.d1 = p256.d2 = p256.d3 = 0ull;
 		q256.d0 = KNOWN_FACTORS[0];	q256.d1 = KNOWN_FACTORS[1];	q256.d2 = KNOWN_FACTORS[2];	q256.d3 = KNOWN_FACTORS[3];
 		res256 = twopmmodq256(p256,q256);
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
-			ASSERT(HERE, CMPEQ256(res256,ONE256),"Factor-divides-modulus check fails!");
+			ASSERT(CMPEQ256(res256,ONE256),"Factor-divides-modulus check fails!");
 		} else {
 			res256.d0 += 1ull;	// Fermat case: check that 2^p == -1 == q - 1 (mod q):
-			ASSERT(HERE, CMPEQ256(res256,q256),"Factor-divides-modulus check fails!");
+			ASSERT(CMPEQ256(res256,q256),"Factor-divides-modulus check fails!");
 		}
 		// If find any duplicate-entries in input list, warn & remove:
 		if(nfac > 1) {
@@ -6376,7 +6376,7 @@ uint32 extract_known_factors(uint64 p, char*fac_start) {
 				if(mi64_cmp_eq(KNOWN_FACTORS + 4*i, KNOWN_FACTORS + 4*(nfac-1), 4)) {
 					mi64_clear(KNOWN_FACTORS + 4*(--nfac), 4);
 					// Using cbuf as both string-arg and target string is problematic, so use 2nd string-global cstr as target:
-					snprintf_nowarn(cstr,STR_MAX_LEN, "WARNING: p = %llu, known-factor list entry %s is a duplicate ... removing.\n",p,cbuf);
+					snprintf(cstr,STR_MAX_LEN, "WARNING: p = %" PRIu64 ", known-factor list entry %s is a duplicate ... removing.\n",p,cbuf);
 					fprintf(stderr,"%s",cstr);
 				}
 			}
@@ -6384,9 +6384,9 @@ uint32 extract_known_factors(uint64 p, char*fac_start) {
 		cptr = char_addr+1;	// Advance 1-char past the current , or "
 	}
 	if(char_addr != 0x0) {
-		sprintf(cbuf,"%s: Unrecognized token sequence in parsing known-factors portion of assignment: \"%s\".",WORKFILE,fac_start);	ASSERT(HERE,0,cbuf);
+		sprintf(cbuf,"%s: Unrecognized token sequence in parsing known-factors portion of assignment: \"%s\".",WORKFILE,fac_start);	ASSERT(0,cbuf);
 	}
-	ASSERT(HERE, nfac != 0,"Must specify at least one known factor!");
+	ASSERT(nfac != 0,"Must specify at least one known factor!");
 // A bit of just-for-fun code: For smaller moduli N, use mi64 utils to see if cofactor C is a base-3 PRP:
 #if 0
 	const char mod_type[2] = {'-','+'}, *is_prp[] = {"is not","is"}, exclam[2] = {'.','!'};
@@ -6397,7 +6397,7 @@ uint32 extract_known_factors(uint64 p, char*fac_start) {
 	qvec = ALLOC_UINT64(qvec,j);	// Quotient stores cofactor C = N/F
 	if(!mvec || !qvec) {
 		sprintf(cbuf, "ERROR: unable to allocate arrays mvec,qvec in extract_known_factors.\n"); fprintf(stderr,"%s", cbuf);
-		ASSERT(HERE, 0,cbuf);
+		ASSERT(0,cbuf);
 	}
 
 	// Compute Modulus N ... note mi64-vecs have no cache-oriented element padding:
@@ -6423,13 +6423,13 @@ uint32 extract_known_factors(uint64 p, char*fac_start) {
 		mi64_mul_vector(BASE_MULTIPLIER_BITS,lenf, KNOWN_FACTORS+i,k, curr_fac,&lenf);
 		mi64_set_eq(BASE_MULTIPLIER_BITS,curr_fac,lenf);
 	}
-	ASSERT(HERE, lenf <= 20, "Product of factors too large to fit into curr_fac[]!");
+	ASSERT(lenf <= 20, "Product of factors too large to fit into curr_fac[]!");
 
 	// Since F << N, use Mont-mul-div for C - quotient overwrites N, no rem-vec needed, just verify that F is in fact a divisor:
-	ASSERT(HERE, 1 == mi64_div(mvec,BASE_MULTIPLIER_BITS, j,lenf, qvec,0x0), "C = N/F should have 0 remainder!");
+	ASSERT(1 == mi64_div(mvec,BASE_MULTIPLIER_BITS, j,lenf, qvec,0x0), "C = N/F should have 0 remainder!");
 	k = mi64_getlen(qvec,j);	// j = number of nonzero limbs in cofactor C
 	i = mi64_pprimeF(qvec,3,k);
-	printf("2^%llu %c 1 %s a base-3 Fermat-PRP%c\n",p,mod_type[MODULUS_TYPE == MODULUS_TYPE_FERMAT],is_prp[i],exclam[i]);
+	printf("2^%" PRIu64 " %c 1 %s a base-3 Fermat-PRP%c\n",p,mod_type[MODULUS_TYPE == MODULUS_TYPE_FERMAT],is_prp[i],exclam[i]);
 	free((void *)mvec);	mvec = 0x0;
 	exit(0);
 #endif
@@ -6451,7 +6451,7 @@ The decimal value of the GCD is returned in gcd_str, presumed to be dimensioned
 uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char*const gcd_str) {
 #if !INCLUDE_GMP
 	#warning INCLUDE_GMP defined == 0 at compile time ... No GCDs will be done on p-1 outputs.
-	snprintf(cbuf,STR_MAX_LEN,"INCLUDE_GMP defined == 0 at compile time ... No GCD will be done.\n");
+	snprintf(cbuf,STR_MAX_LEN*2,"INCLUDE_GMP defined == 0 at compile time ... No GCD will be done.\n");
 	mlucas_fprint(cbuf,1);
 	return 0;	// If user turns off p-1 support, keep the decl of gcd() to allow pm1.c to build
 #else
@@ -6463,8 +6463,8 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char*
 	uint32 i, retval = 0;
 	double tdiff = 0.0, clock1, clock2;
 	clock1 = getRealTime();
-	ASSERT(HERE, vec1 != 0x0, "Null-pointer vec1 input to GCD()!");
-	ASSERT(HERE,!(p && vec2), "One and only one of p and vec2 args to GCD() must be non-null!");
+	ASSERT(vec1 != 0x0, "Null-pointer vec1 input to GCD()!");
+	ASSERT(!(p && vec2), "One and only one of p and vec2 args to GCD() must be non-null!");
 	mpz_init(gmp_arr1); mpz_init(gmp_arr2);
 	// Init divisor, remainder, quotient, in case of nontrivial raw GCD and >= 1 known factors:
 	mpz_init(gmp_d); mpz_init(gmp_r); mpz_init(gmp_q);
@@ -6472,7 +6472,7 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char*
 	// Import vec1 into GMP array1, least-sign. element first, host byte order within each word, at 64-bit width:
 	mpz_import(gmp_arr1, nlimb, -1, sizeof(uint64), 0, 0, vec1);
 	if(p != 0) {
-		ASSERT(HERE, nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6, "Bad inputs to GCD()!");
+		ASSERT(nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6, "Bad inputs to GCD()!");
 		mpz_mul_2exp(gmp_arr2, gmp_one,gmp_exp);
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)	// 2^p-1:
 			mpz_sub(gmp_arr2, gmp_arr2,gmp_one);
@@ -6481,15 +6481,15 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char*
 	} else {
 		mpz_import(gmp_arr2, nlimb, -1, sizeof(uint64), 0, 0, vec2);
 	}
-	sz1 = mpz_sizeinbase(gmp_arr1,2);// gmp_printf("Input1 has %llu bits\n",sz1);
-	sz2 = mpz_sizeinbase(gmp_arr2,2);// gmp_printf("Input2 has %llu bits\n",sz2);
+	sz1 = mpz_sizeinbase(gmp_arr1,2);// gmp_printf("Input1 has %" PRIu64 " bits\n",sz1);
+	sz2 = mpz_sizeinbase(gmp_arr2,2);// gmp_printf("Input2 has %" PRIu64 " bits\n",sz2);
 	// Take gcd and return in gmp_arr1:
 	mpz_gcd(gmp_arr1, gmp_arr1,gmp_arr2);
 	gmp_size = mpz_sizeinbase(gmp_arr1,2);
 	if(gmp_size < 2) {
 		goto gcd_return;	// GCD = 0 or 1
 	} else {
-		if(KNOWN_FACTORS[0]) fprintf(stderr,"Raw GCD has %llu bits ... dividing out any known factors...\n",(uint64)gmp_size);
+		if(KNOWN_FACTORS[0]) fprintf(stderr,"Raw GCD has %" PRIu64 " bits ... dividing out any known factors...\n",(uint64)gmp_size);
 		for(i = 0; i < 40; i += 4) {	// Current limit = 10 factors, each stored in a 4-limb field, i.e. < 2^256
 			if(!KNOWN_FACTORS[i])
 				break;
@@ -6507,24 +6507,24 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char*
 	gmp_size = mpz_sizeinbase(gmp_arr1,10);
 	// Anything >= 900 digits (~90% the value of our STR_MAX_LEN dimensioning of I/O strings) treated as suspect:
 	if(gmp_size >= 900) {
-		snprintf_nowarn(cbuf,STR_MAX_LEN, "GCD has %u digits -- possible data corruption, aborting.\n",(uint32)gmp_size);
-		mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+		snprintf(cbuf,STR_MAX_LEN*2, "GCD has %u digits -- possible data corruption, aborting.\n",(uint32)gmp_size);
+		mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 	}
 	retval = 1;
 gcd_return:
 	if(!p) {
 		gmp_snprintf(gcd_str,STR_MAX_LEN,"%Zd",gmp_arr1);
-		gmp_snprintf(cbuf,STR_MAX_LEN,"GCD(A[%llu bits], B[%llu bits]) = %s\n",sz1,sz2,gcd_str);
+		gmp_snprintf(cbuf,STR_MAX_LEN*2,"GCD(A[%" PRIu64 " bits], B[%" PRIu64 " bits]) = %s\n",sz1,sz2,gcd_str);
 	} else if(retval) {
 		gmp_snprintf(gcd_str,STR_MAX_LEN,"%Zd",gmp_arr1);
-		gmp_snprintf(cbuf,STR_MAX_LEN,"Found %u-digit factor in Stage %u: %s\n",gmp_size,stage,gcd_str);
+		gmp_snprintf(cbuf,STR_MAX_LEN*2,"Found %u-digit factor in Stage %u: %s\n",(uint32)gmp_size,stage,gcd_str);	// Cast matches %u, as in the >= 900-digit abort message; value is < 900 here
 	} else {	// Caller can use either return value or empty gcd_str as proxy for "no factor found"
 		gcd_str[0] = '\0';
-		gmp_snprintf(cbuf,STR_MAX_LEN,"Stage %u: No factor found.\n",stage);
+		gmp_snprintf(cbuf,STR_MAX_LEN*2,"Stage %u: No factor found.\n",stage);
 	}
 	mlucas_fprint(cbuf,1);
 	clock2 = getRealTime(); tdiff = clock2 - clock1;
-	snprintf(cbuf,STR_MAX_LEN,"Time for GCD =%s\n",get_time_str(tdiff));
+	snprintf(cbuf,STR_MAX_LEN*2,"Time for GCD =%s\n",get_time_str(tdiff));
 	mlucas_fprint(cbuf,1);
 	// Done with the GMP arrays:
 	mpz_clear(gmp_arr1); mpz_clear(gmp_arr2); mpz_clear(gmp_d); mpz_clear(gmp_r); mpz_clear(gmp_q);
@@ -6562,20 +6562,20 @@ void modinv(uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb) {
 	uint64 *export_result_addr;
 	double tdiff = 0.0, clock1, clock2;
 	clock1 = getRealTime();
-	ASSERT(HERE, vec1 != 0x0 && vec2 != 0x0, "Null-pointer input to MODINV()!");
+	ASSERT(vec1 != 0x0 && vec2 != 0x0, "Null-pointer input to MODINV()!");
 	mpz_init(gmp_arr1); mpz_init(gmp_arr2);
 	mpz_init_set_ui(gmp_one,1ull); gmp_exp = p;
 	// Import vec1 into GMP array1, least-sign. element first, host byte order within each word, at 64-bit width:
 	// void mpz_import (mpz_t rop, size_t count, int order, size_t size, int	[Function] endian, size_t nails, const void *op)
 	mpz_import(gmp_arr1, nlimb, -1, sizeof(uint64), 0, 0, vec1);
-	ASSERT(HERE, (p != 0) && (nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6), "Bad inputs to MODINV()!");
+	ASSERT((p != 0) && (nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6), "Bad inputs to MODINV()!");
 	mpz_mul_2exp(gmp_arr2, gmp_one,gmp_exp);
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)	// 2^p-1:
 		mpz_sub(gmp_arr2, gmp_arr2,gmp_one);
 	else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)// F(m): p holds 2^m, so F(m) = 2^p+1:
 		mpz_add(gmp_arr2, gmp_arr2,gmp_one);
-//	gmp_printf("Input1 has %llu bits\n",mpz_sizeinbase(gmp_arr1,2));
-//	gmp_printf("Input2 has %llu bits\n",mpz_sizeinbase(gmp_arr2,2));
+//	gmp_printf("Input1 has %" PRIu64 " bits\n",mpz_sizeinbase(gmp_arr1,2));
+//	gmp_printf("Input2 has %" PRIu64 " bits\n",mpz_sizeinbase(gmp_arr2,2));
 	/*
 	GMP mod-inverse; arglist as for mpz_gcd but also returns int:
 
@@ -6590,13 +6590,13 @@ void modinv(uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb) {
 	retval = mpz_invert(gmp_arr1, gmp_arr1,gmp_arr2);
 	gmp_size = mpz_sizeinbase(gmp_arr1,2);
 	if(!retval) {
-		snprintf(cbuf,STR_MAX_LEN,"MODINV: Fatal error: inverse does not exist.\n");
-		mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf);
+		snprintf(cbuf,STR_MAX_LEN*2,"MODINV: Fatal error: inverse does not exist.\n");
+		mlucas_fprint(cbuf,0); ASSERT(0,cbuf);
 	}
 	// Export the result from gmp_arr1 to destination array vec2:
 	// void * mpz_export (void *rop, size_t *countp, int order, size_t size, int	[Function] endian, size_t nails, const mpz_t op)
 	export_result_addr = mpz_export(vec2, &inv_limbs, -1, sizeof(uint64), 0, 0, gmp_arr1);
-	ASSERT(HERE, inv_limbs <= nlimb && export_result_addr == vec2, "GMP was unable to export result to the specified target array!");
+	ASSERT(inv_limbs <= nlimb && export_result_addr == vec2, "GMP was unable to export result to the specified target array!");
 	// Explicitly zero any excess limbs left at top of vec2:
 	for(i = inv_limbs; i < nlimb; i++) {
 		vec2[i] = 0ull;
@@ -6637,7 +6637,7 @@ int restart_file_valid(const char*fname, const uint64 p, uint8*arr1, uint8*arr2)
 uint32 filegrep(const char*fname, const char*find_str, char*cstr, uint32 find_before_line_number)
 {
 	uint32 curr_line = 0, found_line = 0;
-	ASSERT(HERE, cstr != 0x0, "filegrep(): cstr pointer argument must be non-null!");
+	ASSERT(cstr != 0x0, "filegrep(): cstr pointer argument must be non-null!");
 	cstr[0] = '\0';
 	if(strlen(find_str) == 0)	// Nothing to find
 		return 0;
@@ -6657,7 +6657,7 @@ uint32 filegrep(const char*fname, const char*find_str, char*cstr, uint32 find_be
 		fclose(fptr);
 	} else {
 		sprintf(cbuf,"filegrep error: file %s not found.\n",fname);
-		ASSERT(HERE, 0, cbuf);
+		ASSERT(0, cbuf);
 	}
 	if(strlen(cstr) != 0)
 		return found_line;
@@ -6670,9 +6670,9 @@ void write_fft_debug_data(double a[], int jlo, int jhi)
 {
 	int j,j1;
 	const char dbg_fname[] = "FFT_DEBUG.txt";
-	ASSERT(HERE, dbg_file == 0x0, "dbg_file != 0x0 prior to mlucas_fopen");
+	ASSERT(dbg_file == 0x0, "dbg_file != 0x0 prior to mlucas_fopen");
 	dbg_file = mlucas_fopen(dbg_fname, "a");
-	ASSERT(HERE, dbg_file != 0x0, "Unable to open dbg_file!");
+	ASSERT(dbg_file != 0x0, "Unable to open dbg_file!");
 	fprintf(dbg_file, "RE_IM_STRIDE = %d\n", RE_IM_STRIDE);
 	fprintf(dbg_file, "%s\n", cbuf);
 
diff --git a/src/Mlucas.h b/src/Mlucas.h
old mode 100755
new mode 100644
diff --git a/src/br.c b/src/br.c
index 93d525cd..0e09182a 100755
--- a/src/br.c
+++ b/src/br.c
@@ -36,8 +36,8 @@ void print_pow2_twiddles(const uint32 n, const uint32 p, const uint32 q)
 	const char csigns[2] = {'+','-'};
 	const char re_im[2] = {'c','s'};
 	char prefix[3];	// 0-slot for overall sign; 1 for complex operator * [Re / Im interchange], 2 for ~ [complex conjugation].
-	ASSERT(HERE, n == (1<<lgn), "n not a power of 2!");
-	ASSERT(HERE, n == p*q, "n != p*q!");
+	ASSERT(n == (1<<lgn), "n not a power of 2!");
+	ASSERT(n == p*q, "n != p*q!");
 	printf("Fundamental-root powers for %d x %d impl of radix-%d DFT:\n",p,q,n);
 	for(i = 1; i < p; i++) {	// Skip 0-row, since those roots = 1
 		ir = reverse(i,lgp);
@@ -231,7 +231,7 @@ void bit_reverse_int(int vec[], int n, int nradices, int radix[], int incr, int*
 	/* If no scratch-space array provided, create one locally: */
 	if(arr_scratch) {
 		/* Don't allow reuse of main array for inits at this time: */
-		ASSERT(HERE, &vec[0] != &arr_scratch[0], "Array re-use not currently supported!");
+		ASSERT(&vec[0] != &arr_scratch[0], "Array re-use not currently supported!");
 		tmp = arr_scratch;
 	} else {
 		tmp = (int *)malloc(n*sizeof(int));
@@ -254,7 +254,7 @@ void bit_reverse_int(int vec[], int n, int nradices, int radix[], int incr, int*
 			i += incr;
 		}
 		printf("] != vector length [%u] in BIT_REVERSE_INT\n",n);
-		ASSERT(HERE,0,"Exiting.");
+		ASSERT(0,"Exiting.");
 	}
 
 	/*...We don't use the final radix for the bit reversal, we simply need it for array bounds checking. */
diff --git a/src/carry_dbg.h b/src/carry_dbg.h
index f8a468d4..16824443 100755
--- a/src/carry_dbg.h
+++ b/src/carry_dbg.h
@@ -470,7 +470,7 @@ printf("WARN: frac = %10.8f occurred in Re(a[%2u]) at j = %10d\n",frac,j,0);\
 		cy   = (temp*baseinv[i] + RND_A) - RND_B;\
 check_nint(cy, temp*baseinv[i]);\
 		x = (temp-cy*base[i]);\
-ASSERT(HERE, fabs(x+x) <= base[i], "X-output out of range!");\
+ASSERT(fabs(x+x) <= base[i], "X-output out of range!");\
 		x *= wt;\
 		\
 	  bjmodn -= sw;\
@@ -496,7 +496,7 @@ printf("WARN: frac = %10.8f occurred in Im(a[%2u]) at j = %10d\n",frac,j,0);\
 		cy   = (temp*baseinv[i] + RND_A) - RND_B;\
 check_nint(cy, temp*baseinv[i]);\
 		y = (temp-cy*base[i]);\
-ASSERT(HERE, fabs(y+y) <= base[i], "Y-output out of range!");\
+ASSERT(fabs(y+y) <= base[i], "Y-output out of range!");\
 		y *= wt;\
 		\
 	  bjmodn -= sw;\
@@ -528,7 +528,7 @@ printf("WARN: frac = %10.8f occurred in Re(a[%2u]) at j = %10d\n",frac,j,set);\
 		cy   = (temp*baseinv[i] + RND_A) - RND_B;\
 check_nint(cy, temp*baseinv[i]);\
 		x = (temp-cy*base[i]);\
-ASSERT(HERE, fabs(x+x) <= base[i], "X-output out of range!");\
+ASSERT(fabs(x+x) <= base[i], "X-output out of range!");\
 		x *= wt;\
 		\
 	  bjmodn -= sw;\
@@ -554,7 +554,7 @@ printf("WARN: frac = %10.8f occurred in Im(a[%2u]) at j = %10d\n",frac,j,set);\
 		cy   = (temp*baseinv[i] + RND_A) - RND_B;\
 check_nint(cy, temp*baseinv[i]);\
 		y = (temp-cy*base[i]);\
-ASSERT(HERE, fabs(y+y) <= base[i], "Y-output out of range!");\
+ASSERT(fabs(y+y) <= base[i], "Y-output out of range!");\
 		y *= wt;\
 		\
 	  bjmodn -= sw;\
@@ -721,7 +721,7 @@ sign  = ix & himask;\
 mant  = ix & mmask;\
 dexp  = (ix-sign)>>52;\
 shift = 1074 - dexp;\
-/*if(j1==0)printf("0xmant,shift,bits = %20llX  %10d  %10u\n",mant,shift,bits);*/\
+/*if(j1==0)printf("0xmant,shift,bits = %20" PRIX64 "  %10d  %10u\n",mant,shift,bits);*/\
 if(shift<0)printf("WARN: j1 = %10d  %20.15e gives negative shift count = %10d\n",j1,x,shift);\
 if(shift < 52)\
 {\
@@ -730,15 +730,15 @@ ifrac = mant << (63-shift);\
 if(ifrac > ifracmax) ifracmax=ifrac;\
 mant += ((uint64)1)<<shift;\
 mant  = (mant+two52)>>(shift+1);\
-/*if(j1==0)printf("A: 0xmant = %20llX\n",mant);*/\
+/*if(j1==0)printf("A: 0xmant = %20" PRIX64 "\n",mant);*/\
 mant -= (mant & sign)<<1;\
-/*if(j1==0)printf("B: 0xmant = %20llX\n",mant);*/\
+/*if(j1==0)printf("B: 0xmant = %20" PRIX64 "\n",mant);*/\
 word  = mant & (~(ones << bits));\
-/*if(j1==0)printf("C: 0xword = %20llX\n",word);*/\
+/*if(j1==0)printf("C: 0xword = %20" PRIX64 "\n",word);*/\
 topbit= word >> (bits - 1);\
-/*if(j1==0)printf("D: 0xtbit = %20llX\n",topbit);*/\
+/*if(j1==0)printf("D: 0xtbit = %20" PRIX64 "\n",topbit);*/\
 word -= topbit << bits;\
-/*if(j1==0)printf("E: 0xword = %20llX\n",word);*/\
+/*if(j1==0)printf("E: 0xword = %20" PRIX64 "\n",word);*/\
 x     = wt*(double)word;\
 cy    = (double)( (mant >> bits) + topbit );\
 /*if(j1==0)printf("%20.4f  %20.4f\n",x,cy);*/\
diff --git a/src/dft_macro.c b/src/dft_macro.c
index b3432a7e..4b41aa29 100755
--- a/src/dft_macro.c
+++ b/src/dft_macro.c
@@ -3396,18 +3396,18 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		if(thr_id == -1)	// Value of init stores #threads
 		{
 			if(init <= max_threads) {	// Previously inited with sufficient #threads
-				ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+				ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 				return;
 			}
 			max_threads = init;
 		#ifndef COMPILER_TYPE_GCC
-			ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+			ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 		#endif
 			if(sc_arr) { free((void *)sc_arr); }
 			// 126 slots for DFT-63 data, 22 for DFT-7,9 consts and DFT-7 pads, 4 to allow for alignment = 152:
-			sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = ALIGN_VEC_DBL(sc_arr);
-			ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 		#ifdef MULTITHREAD
 			__r0 = tdat = sc_ptr;	tmp = tdat + 126;
@@ -3491,12 +3491,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		#endif
 			return;
 		} else {
-			ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+			ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 		}	/* end of inits */
 
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		tdat = __r0 + thr_id*152;	tmp = tdat + 126;
 		two    = tmp + 0x0;
 		one    = tmp + 0x1;
@@ -3645,18 +3645,18 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		if(thr_id == -1)	// Value of init stores #threads
 		{
 			if(init <= max_threads) {	// Previously inited with sufficient #threads
-				ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+				ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 				return;
 			}
 			max_threads = init;
 		#ifndef COMPILER_TYPE_GCC
-			ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+			ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 		#endif
 			if(sc_arr) { free((void *)sc_arr); }
 			// 126 slots for DFT-63 data, 22 for DFT-7,9 consts and DFT-7 pads, 4 to allow for alignment = 152:
-			sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = ALIGN_VEC_DBL(sc_arr);
-			ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 		#ifdef MULTITHREAD
 			__r0 = tdat = sc_ptr;	tmp = tdat + 126;
@@ -3740,12 +3740,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		#endif
 			return;
 		} else {
-			ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+			ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 		}	/* end of inits */
 
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		tdat = __r0 + thr_id*152;	tmp = tdat + 126;
 		two    = tmp + 0x0;
 		one    = tmp + 0x1;
@@ -3877,17 +3877,17 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		if(thr_id == -1)	// Value of init stores #threads
 		{
 			if(init <= max_threads) {	// Previously inited with sufficient #threads
-				ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+				ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 				return;
 			}
 			max_threads = init;
 		#ifndef COMPILER_TYPE_GCC
-			ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+			ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 		#endif
 			if(sc_arr) { free((void *)sc_arr); }
-			sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = ALIGN_VEC_DBL(sc_arr);
-			ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 		#ifdef MULTITHREAD
 			__r0 = tmp = sc_ptr;
@@ -3925,7 +3925,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 			//	VEC_DBL_INIT(sqrt2, SQRT2);	VEC_DBL_INIT(tmp, ISRT2);
 				VEC_DBL_INIT(nisrt2,-dtmp);
 				VEC_DBL_INIT( isrt2, dtmp);									// Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround:
-				VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
+				VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
 				VEC_DBL_INIT( cc1, c64_1);		VEC_DBL_INIT( ss1, s64_1);		tmp =  cc1-1; VEC_DBL_INIT(tmp, dtmp);
 				VEC_DBL_INIT( cc2, c32_1);		VEC_DBL_INIT( ss2, s32_1);		tmp =  cc2-1; VEC_DBL_INIT(tmp, dtmp);
 				VEC_DBL_INIT( cc3, c64_3);		VEC_DBL_INIT( ss3, s64_3);		tmp =  cc3-1; VEC_DBL_INIT(tmp, dtmp);
@@ -4013,7 +4013,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		//	VEC_DBL_INIT(sqrt2, SQRT2);	VEC_DBL_INIT(tmp, ISRT2);
 			VEC_DBL_INIT(nisrt2,-dtmp);
 			VEC_DBL_INIT( isrt2, dtmp);									// Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround:
-			VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
+			VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
 			VEC_DBL_INIT( cc1, c64_1);		VEC_DBL_INIT( ss1, s64_1);		tmp =  cc1-1; VEC_DBL_INIT(tmp, dtmp);
 			VEC_DBL_INIT( cc2, c32_1);		VEC_DBL_INIT( ss2, s32_1);		tmp =  cc2-1; VEC_DBL_INIT(tmp, dtmp);
 			VEC_DBL_INIT( cc3, c64_3);		VEC_DBL_INIT( ss3, s64_3);		tmp =  cc3-1; VEC_DBL_INIT(tmp, dtmp);
@@ -4032,12 +4032,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		//	fprintf(stderr, "Init SSE2_RADIX_64_DIF with max_threads = %d\n",max_threads);
 			return;
 		} else {
-			ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+			ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 		}	/* end of inits */
 
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		tmp = __r0 + thr_id*0x32;
 		// To support FMA versions of the radix-8 macros used to build radix-64 we insert a standalone copy of the [2,1,sqrt2,isrt2] quartet:
 		two     = tmp + 0;	// AVX+ versions of various DFT macros assume consts 2.0,1.0,isrt2 laid out thusly
@@ -4280,17 +4280,17 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		if(thr_id == -1)	// Value of init stores #threads
 		{
 			if(init <= max_threads) {	// Previously inited with sufficient #threads
-				ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+				ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 				return;
 			}
 			max_threads = init;
 		#ifndef COMPILER_TYPE_GCC
-			ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+			ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 		#endif
 			if(sc_arr) { free((void *)sc_arr); }
-			sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = ALIGN_VEC_DBL(sc_arr);
-			ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 		#ifdef MULTITHREAD
 			__r0 = tmp = sc_ptr;
@@ -4328,7 +4328,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 			//	VEC_DBL_INIT(sqrt2, SQRT2);	VEC_DBL_INIT(tmp, ISRT2);
 				VEC_DBL_INIT(nisrt2,-dtmp);
 				VEC_DBL_INIT( isrt2, dtmp);									// Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround:
-				VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
+				VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
 				VEC_DBL_INIT( cc1, c64_1);		VEC_DBL_INIT( ss1, s64_1);		tmp =  cc1-1; VEC_DBL_INIT(tmp, dtmp);
 				VEC_DBL_INIT( cc2, c32_1);		VEC_DBL_INIT( ss2, s32_1);		tmp =  cc2-1; VEC_DBL_INIT(tmp, dtmp);
 				VEC_DBL_INIT( cc3, c64_3);		VEC_DBL_INIT( ss3, s64_3);		tmp =  cc3-1; VEC_DBL_INIT(tmp, dtmp);
@@ -4416,7 +4416,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		//	VEC_DBL_INIT(sqrt2, SQRT2);	VEC_DBL_INIT(tmp, ISRT2);
 			VEC_DBL_INIT(nisrt2,-dtmp);
 			VEC_DBL_INIT( isrt2, dtmp);									// Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround:
-			VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
+			VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
 			VEC_DBL_INIT( cc1, c64_1);		VEC_DBL_INIT( ss1, s64_1);		tmp =  cc1-1; VEC_DBL_INIT(tmp, dtmp);
 			VEC_DBL_INIT( cc2, c32_1);		VEC_DBL_INIT( ss2, s32_1);		tmp =  cc2-1; VEC_DBL_INIT(tmp, dtmp);
 			VEC_DBL_INIT( cc3, c64_3);		VEC_DBL_INIT( ss3, s64_3);		tmp =  cc3-1; VEC_DBL_INIT(tmp, dtmp);
@@ -4435,12 +4435,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		//	fprintf(stderr, "Init SSE2_RADIX_64_DIT with max_threads = %d\n",max_threads);
 			return;
 		} else {
-			ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
+			ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!");
 		}	/* end of inits */
 
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		tmp = __r0 + thr_id*0x32;
 		// To support FMA versions of the radix-8 macros used to build radix-64 we insert a standalone copy of the [2,1,sqrt2,isrt2] quartet:
 		two     = tmp + 0;	// AVX+ versions of various DFT macros assume consts 2.0,1.0,isrt2 laid out thusly
@@ -4888,7 +4888,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
 		// Index-offset names here reflect original unpermuted inputs, but the math also works for permuted ones:
 		int i,j,nshift, *off_ptr;
 		int p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pa,pb,pc,pd,pe,pf;
-		ASSERT(HERE, o_idx != 0x0, "Null o_idx pointer in SSE2_RADIX256_DIF!");
+		ASSERT(o_idx != 0x0, "Null o_idx pointer in SSE2_RADIX256_DIF!");
 	// NOTE that unlike the RADIX_08_DIF_OOP() macro used for pass 1 of the radix-64 DFT, RADIX_16_DIF outputs are IN-ORDER rather than BR:
 	  #ifdef USE_ARM_V8_SIMD
 		uint32 OFF1,OFF2,OFF3,OFF4;
diff --git a/src/f2psp.h b/src/f2psp.h
index 11ee2170..c6b7ba96 100755
--- a/src/f2psp.h
+++ b/src/f2psp.h
@@ -37,8 +37,8 @@ extern "C" {
 		retval	\
 	)\
 	{\
-		DBG_ASSERT(HERE, qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\
-		DBG_ASSERT(HERE, ((uint32)&a[0] & 0x3f) == 0, "A-array not 64-byte aligned!");\
+		DBG_ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\
+		DBG_ASSERT(((uint32)&a[0] & 0x3f) == 0, "A-array not 64-byte aligned!");\
 		__asm	mov	eax, array_64x8inputs	/* Assumes inputs a,b,c,d,... are 64-bit separated and &a[0} is 64-byte aligned */\
 		__asm	lea	ebx, q\
 		__asm	lea	ecx, qinv\
diff --git a/src/factor.c b/src/factor.c
index e23b1bbd..bed29234 100755
--- a/src/factor.c
+++ b/src/factor.c
@@ -196,7 +196,7 @@ int restart;
 	uint64 PMIN;	/* minimum #bits allowed for FFT-based mul */
 	uint64 PMAX;	/* maximum #bits allowed depends on max. FFT length allowed
 					  and will be determined at runtime, via call to given_N_get_maxP(). */
-	char cbuf[STR_MAX_LEN],cstr[STR_MAX_LEN];
+	char cbuf[STR_MAX_LEN*2],cstr[STR_MAX_LEN];
 	char in_line[STR_MAX_LEN];
 	/* Declare a blank STATFILE string to ease program logic: */
 	char STATFILE[] = "";
@@ -547,7 +547,7 @@ Unlike for (mod 60), use simple utility functions to manage these, rather than a
 
 int factor(char *pstring, double bmin, double bmax)
 {
-	ASSERT(HERE, 0, "TF currently not supported as part of Mlucas, only via standalone Mfactor build - please delete any .o files and retry USING 'makemake.sh mfac' from Mluas dir above /src.");
+	ASSERT(0, "TF currently not supported as part of Mlucas, only via standalone Mfactor build - please delete any .o files and retry USING 'makemake.sh mfac' from Mlucas dir above /src.");
 	return 1;
 }
 
@@ -732,7 +732,7 @@ int main(int argc, char *argv[])
 	if(cudaError != cudaSuccess)
 	{
 		printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError));
-		ASSERT(HERE, 0, "factor.c : GPU-side error detected!");
+		ASSERT(0, "factor.c : GPU-side error detected!");
 	}
   #endif
 
@@ -743,7 +743,7 @@ int main(int argc, char *argv[])
 /* Allocate factor_k array and align on 16-byte boundary: */
 	factor_ptmp = ALLOC_UINT64(factor_ptmp, 24);
 	factor_k = ALIGN_UINT64(factor_ptmp);	factor_ptmp = 0x0;
-	ASSERT(HERE, ((uint64)factor_k & 0x3f) == 0, "factor_k not 64-byte aligned!");
+	ASSERT(((uint64)factor_k & 0x3f) == 0, "factor_k not 64-byte aligned!");
 
 /*...initialize logicals and factoring parameters...	*/
 	restart = FALSE;
@@ -924,14 +924,14 @@ Others are optional and in some cases mutually exclusive:
 		{
 			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
 			passmin = (uint32)convert_base10_char_uint64(stFlag);
-			ASSERT(HERE, passmin < TF_PASSES,"factor.c: passmin < TF_PASSES");
+			ASSERT(passmin < TF_PASSES,"factor.c: passmin < TF_PASSES");
 		}
 		else if(STREQ(stFlag, "-passmax"))
 		{
 			strncpy(stFlag, argv[nargs++], STR_MAX_LEN);
 			passmax = (uint32)convert_base10_char_uint64(stFlag);
-			ASSERT(HERE, passmax < TF_PASSES,"factor.c: passmax < TF_PASSES");
-			ASSERT(HERE, passmax >= passmin       ,"factor.c: passmax >= passmin");
+			ASSERT(passmax < TF_PASSES,"factor.c: passmax < TF_PASSES");
+			ASSERT(passmax >= passmin       ,"factor.c: passmax >= passmin");
 		}
 
 		// Number of threads to use?
@@ -953,7 +953,7 @@ Others are optional and in some cases mutually exclusive:
 				NTHREADS = itmp;
 			}
 		  #ifdef NWORD
-			ASSERT(HERE, NTHREADS == 1, "Arbitrary-precision build currently only supports single-threaded runs!");
+			ASSERT(NTHREADS == 1, "Arbitrary-precision build currently only supports single-threaded runs!");
 		  #endif
 		#endif
 		}
@@ -968,11 +968,11 @@ Others are optional and in some cases mutually exclusive:
   #else
 
 	/* If non-standalone mode, make sure statfile name is non-empty: */
-	ASSERT(HERE, STRNEQ(STATFILE, ""), "STATFILE string empty");
+	ASSERT(STRNEQ(STATFILE, ""), "STATFILE string empty");
 	fp = mlucas_fopen(STATFILE, "a");
 	if(!fp) {
 		fprintf(stderr,"ERROR: Unable to open statfile %s for writing.\n",STATFILE);
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	} else {
 		fclose(fp); fp = 0x0;
 	}
@@ -985,12 +985,12 @@ Others are optional and in some cases mutually exclusive:
 		first_entry = FALSE;
 	#ifndef MULTITHREAD
 		#warning Building factor.c in unthreaded (i.e. single-main-thread) mode.
-		ASSERT(HERE, NTHREADS == 1, "NTHREADS must == 1 in single-threaded mode!");
+		ASSERT(NTHREADS == 1, "NTHREADS must == 1 in single-threaded mode!");
 		k_to_try = (uint64 *)calloc(TRYQ * NTHREADS, sizeof(uint64));
 	#else
 		MAX_THREADS = get_num_cores();
-		ASSERT(HERE, MAX_THREADS > 0, "Illegal #Cores value stored in MAX_THREADS");
-		ASSERT(HERE, MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h .");
+		ASSERT(MAX_THREADS > 0, "Illegal #Cores value stored in MAX_THREADS");
+		ASSERT(MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h .");
 
 		if(!NTHREADS) {
 			NTHREADS = 1;
@@ -998,7 +998,7 @@ Others are optional and in some cases mutually exclusive:
 			// Use the same affinity-setting code here as for the -cpu option, but simply for cores [0:NTHREADS-1]:
 		} else if(NTHREADS > MAX_CORES) {
 			sprintf(cbuf,"FATAL: NTHREADS = %d exceeds the MAX_CORES setting in Mdata.h = %d\n", NTHREADS, MAX_CORES);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		} else {	// In timing-test mode, allow #threads > #cores
 			if(NTHREADS > MAX_THREADS) {
 				fprintf(stderr,"WARN: NTHREADS = %d exceeds number of cores = %d\n", NTHREADS, MAX_THREADS);
@@ -1021,7 +1021,7 @@ Others are optional and in some cases mutually exclusive:
 		// do TF_PASSES 'work units' (factoring passes for various (k mod TF_CLASSES) k-classes:
 		main_work_units = 0;
 		pool_work_units = NTHREADS;
-		ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+		ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 		printf("Factor.c: Init threadpool of %d threads\n", NTHREADS);
 
 		// Apr 2015: Init-calls to any inline-asm-using modpow functions:
@@ -1050,7 +1050,7 @@ Others are optional and in some cases mutually exclusive:
 // Oct 2015: GCD-associated self-tests provides a fair bit of added coverage of the mi64 library, so always include:
   #ifdef INCLUDE_PM1
 	/* Simple self-tester for GCD routines in gcd_lehmer.c: */
-	ASSERT(HERE, test_gcd() == 0, "Factor_init : GCD test failed.\n");
+	ASSERT(test_gcd() == 0, "Factor_init : GCD test failed.\n");
 exit(0);
   #endif
 
@@ -1058,44 +1058,44 @@ exit(0);
 	command-line parameter, will attempt to read the other needed run parameters
 	from the corresponding checkpoint file:
 	*/
-	ASSERT(HERE, STRNEQ(pstring,""),"factor.c : pstring empty!");
+	ASSERT(STRNEQ(pstring,""),"factor.c : pstring empty!");
 
 	/* -bmin/bmax used to set bounds for factoring: */
 	if(bmin || bmax) {
-		ASSERT(HERE, (kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0)");
+		ASSERT((kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0)");
 
 		if(bmin < 0) {
-			fprintf(stderr,"ERROR: log2(min factor) must be >= 0. Offending entry = %lf.\n", bmin);		ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: log2(min factor) must be >= 0. Offending entry = %lf.\n", bmin);		ASSERT(0,"0");
 		} else if(bmin >= MAX_BITS_Q) {
-			fprintf(stderr,"ERROR: log2(min factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmin);	ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: log2(min factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmin);	ASSERT(0,"0");
 		}
 
 		if(bmax <= 0) {
-			fprintf(stderr,"ERROR: log2(max factor) must be > 0. Offending entry = %lf.\n", bmax);		ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: log2(max factor) must be > 0. Offending entry = %lf.\n", bmax);		ASSERT(0,"0");
 		} else if(bmax > MAX_BITS_Q) {
-			fprintf(stderr,"ERROR: log2(max factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmax);	ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: log2(max factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmax);	ASSERT(0,"0");
 		}
 
 		if(bmax < bmin) {
-			fprintf(stderr,"ERROR: (bmax = %lf) < (bmin = %lf)!\n", bmax, bmin);	ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: (bmax = %lf) < (bmin = %lf)!\n", bmax, bmin);	ASSERT(0,"0");
 		}
 	}
 
 	/* -kmin/kmax used to set bounds for factoring: */
 	if(kmin || kmax) {
-		ASSERT(HERE, kmax != 0 ,"factor.c: kmax not set!");
-		ASSERT(HERE, (int64)kmax > 0, "kmax must be 63 bits or less!");
-		ASSERT(HERE, (bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)");
+		ASSERT(kmax != 0 ,"factor.c: kmax not set!");
+		ASSERT((int64)kmax > 0, "kmax must be 63 bits or less!");
+		ASSERT((bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)");
 
 		if(kmax < kmin) {
 			fprintf(stderr,"ERROR: (kmax = %s) < (kmin = %s)!\n", &char_buf0[convert_uint64_base10_char(char_buf0, kmax)], &char_buf1[convert_uint64_base10_char(char_buf1, kmin)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
-	ASSERT(HERE, bmax > 0.0 || kmax != 0 ,"factor.c: One of bmax or kmax must be set!");
+	ASSERT(bmax > 0.0 || kmax != 0 ,"factor.c: One of bmax or kmax must be set!");
 
-	ASSERT(HERE, (MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
+	ASSERT((MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 			  || (MODULUS_TYPE == MODULUS_TYPE_MERSMERS)
 			  || (MODULUS_TYPE ==   MODULUS_TYPE_FERMAT)
 				, "Unsupported modulus type!");
@@ -1119,7 +1119,7 @@ exit(0);
 		nbits_in_p = findex;
 		if(findex > 1000) {	// Large MMp need deeper sieving on each k passing the default sieve
 			kdeep = (uint32 *)calloc( 1024, sizeof(uint32));
-			ASSERT(HERE, kdeep != 0x0, "Calloc of kdeep[] failed!");
+			ASSERT(kdeep != 0x0, "Calloc of kdeep[] failed!");
 		}
 		lenP = (nbits_in_p + 63)>>6;
 		p     = (uint64 *)calloc( ((uint32)MAX_BITS_P + 63)>>6, sizeof(uint64));
@@ -1131,7 +1131,7 @@ exit(0);
 	} else {
 		// Convert stringified exponent to mi64 form, using same #limbs as for factor candidates:
 		p = convert_base10_char_mi64(pstring, &lenQ);	// This does the mem-alloc for us in this case
-		lenP = mi64_getlen(p, lenQ); ASSERT(HERE, lenP > 0, "factor.c: Error converting pstring!");
+		lenP = mi64_getlen(p, lenQ); ASSERT(lenP > 0, "factor.c: Error converting pstring!");
 		nbits_in_p = (lenP<<6) - mi64_leadz(p, lenP);
 	}
 
@@ -1157,14 +1157,14 @@ exit(0);
 	// Mersenne numbers must have odd (check primality further on) exponents:
 	if((MODULUS_TYPE != MODULUS_TYPE_FERMAT) && (p[0] & 1) == 0)
     {
-		fprintf(stderr,"p must be odd! Offending p = %s\n", pstring); ASSERT(HERE, 0,"0");
+		fprintf(stderr,"p must be odd! Offending p = %s\n", pstring); ASSERT(0,"0");
 	}
 
 	/* For purposes of the bits-in-p limit, treat Fermat numbers as having 2^findex rather than 2^findex + 1 bits: */
 	if((nbits_in_p - (MODULUS_TYPE == MODULUS_TYPE_FERMAT)) > MAX_BITS_P)
 	{
 		fprintf(stderr,"p too large - limit is %u bits. Offending p = %s\n", MAX_BITS_P, pstring);
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 	// To track lg(q) = lg(2.k.p+1), use approximation q ~= 2.k.p, thus lg(q) ~= lg(2.p) + lg(k).
 	fbits_in_2p = (double)mi64_extract_lead64(p, lenP, &itmp64) - 64;
@@ -1173,16 +1173,16 @@ exit(0);
 //printf("fbits_in_2p += log((double)itmp64)*ILG2 [= %10.4f] = %10.4f\n",log((double)itmp64)*ILG2,fbits_in_2p);
   #if 0	// 11/2013: No clue what I was thinking here...
 	// If 2p < 2^64 we left-justify the leading bits to make result lie in [2^63, 2^64), so result here must always be > 2^63:
-	ASSERT(HERE, fbits_in_2p >= 63, "fbits_in_2p out of range!");
+	ASSERT(fbits_in_2p >= 63, "fbits_in_2p out of range!");
 	fbits_in_2p += nbits_in_p - 64.0;	// lg(2.p) ... Cast 64 to double to avoid signed-int subtract of RHS terms.
   #endif
 	// Do some quick sanity tests of exponent for the various kinds of moduli:
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, findex == mi64_trailz(p, lenP), "Internal Fermat-exponent bad power of 2!");
+		ASSERT(findex == mi64_trailz(p, lenP), "Internal Fermat-exponent bad power of 2!");
 		mi64_shrl(p, q, findex, lenP,lenP);
 		mi64_sub_scalar(q, 1ull, q, lenP);
-		ASSERT(HERE, mi64_iszero(q, lenP), "Internal Fermat-exponent not a power of 2!");
+		ASSERT(mi64_iszero(q, lenP), "Internal Fermat-exponent not a power of 2!");
 	}
 	else
 	{
@@ -1193,14 +1193,14 @@ exit(0);
 			{
 				if(findex == knowns[i]) { break; }
 			}
-			ASSERT(HERE, (knowns[i] != 0), "Double-Mersenne exponent not a known Mersenne prime!");
+			ASSERT((knowns[i] != 0), "Double-Mersenne exponent not a known Mersenne prime!");
 
 			// And now proceed to all-binary-ones test of vector-form M(p):
 			mi64_add_scalar(p, 1ull, q, lenP);
-			ASSERT(HERE, findex == mi64_trailz(q, lenP), "Internal M(M(p))-exponent bad power of 2!");
+			ASSERT(findex == mi64_trailz(q, lenP), "Internal M(M(p))-exponent bad power of 2!");
 			mi64_shrl(q, q, findex, lenP,lenP);
 			mi64_sub_scalar(q, 1ull, q, lenP);
-			ASSERT(HERE, mi64_iszero(q, lenP), "Internal M(M(p))-exponent fails all-binary-ones check!");
+			ASSERT(mi64_iszero(q, lenP), "Internal M(M(p))-exponent fails all-binary-ones check!");
 		}
 		// We can use a lookup table vs known M(p) for all cases, but if Mersenne or M(M(p)) with suitably small p,
 		// add a base-2 Fermat PRP test, more as a self-test of the various modpow routines than anything else:
@@ -1209,7 +1209,7 @@ exit(0);
 			if(!mi64_twopmodq(q, lenP, 0, p, lenP, 0x0))
 			{
 				fprintf(stderr,"WARNING: p = %s is not prime ... proceeding anyway, on presumption user wants this.\n", pstring);
-			//	ASSERT(HERE, 0,"0");	Dec 2019 ... allowing odd composite exponents can still be useful, e.g. ATH used to TF M(p^2) for known Mersenne primes
+			//	ASSERT(0,"0");	Dec 2019 ... allowing odd composite exponents can still be useful, e.g. ATH used to TF M(p^2) for known Mersenne primes
 			}
 		}
 	}
@@ -1278,14 +1278,14 @@ exit(0);
 	if(kmax) {
 		interval_hi = (uint64)ceil((double)kmax/((uint64)len << TF_CLSHIFT));	// Copied from restart-file code below
 		u64_arr[lenP] = mi64_mul_scalar( p, 2*interval_hi*(len << TF_CLSHIFT), u64_arr, lenP);
-		ASSERT(HERE, lenQ == lenP+(u64_arr[lenP] != 0), "");
+		ASSERT(lenQ == lenP+(u64_arr[lenP] != 0), "");
 
 		nbits_in_q = (lenQ<<6) - mi64_leadz(u64_arr, lenQ);
 
 		if(nbits_in_q > MAX_BITS_Q)
 		{
 			fprintf(stderr,"qmax too large - limit is %u bits. Offending p, kmax = %s, %s\n", MAX_BITS_Q, pstring, &char_buf0[convert_uint64_base10_char(char_buf0, kmax)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -1311,7 +1311,7 @@ exit(0);
 	else
 		fprintf(stderr,"INFO: Will write checkpoint data to savefile %s.\n",RESTARTFILE);
 
-	fprintf(stderr,"INFO: Will write savefile %s every 2^%u = %llu factor candidates tried.\n",RESTARTFILE,CMASKBITS,countmask+1);
+	fprintf(stderr,"INFO: Will write savefile %s every 2^%u = %" PRIu64 " factor candidates tried.\n",RESTARTFILE,CMASKBITS,countmask+1);
 
 	/**** process restart-file and any command-line params: ****/
 	// Note: return value of read_savefile is signed:
@@ -1323,9 +1323,9 @@ exit(0);
 		fq = mlucas_fopen(STATFILE,"a"); fprintf(fq,"%s",cbuf); fclose(fq); fq = 0x0;
 	#endif
 		// Init savefile with above read_savefile fields so ensuing checkpoint-writes only need to update the pass# and k:
-//		ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!");
+//		ASSERT(0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!");
 	} else {
-		ASSERT(HERE,!itmp,"There were errors reading the savefile ... aborting");
+		ASSERT(!itmp,"There were errors reading the savefile ... aborting");
 		count = 0ull;	// Need to reset == 0 prior to sieving so kvector-fill code works properly
 
 		/* If previous run is not yet complete, ignore any increased factor-bound-related
@@ -1349,19 +1349,19 @@ exit(0);
 			****/
 			if(bmin || bmax) {
 			#if(!defined(P1WORD))
-			//	ASSERT(HERE, 0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!");
+			//	ASSERT(0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!");
 			#endif
-				ASSERT(HERE, (kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run.");
+				ASSERT((kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run.");
 
 				if(bmin) {
-					ASSERT(HERE, bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file");
+					ASSERT(bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file");
 					if(bmin < bmax_file)
 						fprintf(stderr,"WARNING: Specified bmin (%lf) smaller than previous-run bmax = %lf. Setting equal to avoid overlapping runs.\n", bmin, bmax_file);
 				}
 				bmin = bmax_file;
 				/* We expect any command-line bmax will be > that in the restart file: */
 				if(bmax)
-					ASSERT(HERE, bmax > bmax_file - 0.0000000001,"bmax >= bmax_file");
+					ASSERT(bmax > bmax_file - 0.0000000001,"bmax >= bmax_file");
 			}
 
 			/****
@@ -1371,26 +1371,26 @@ exit(0);
 					if not we warn and set kmin = kmax_file), and that kmax > kmax_file.
 			****/
 			if(kmin || kmax) {
-				ASSERT(HERE, (bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)");
+				ASSERT((bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)");
 				if(kmin) {
-					ASSERT(HERE, kmin >= kmin_file,"kmin >= kmin_file");
+					ASSERT(kmin >= kmin_file,"kmin >= kmin_file");
 					if(kmin < kmax_file)
 						fprintf(stderr,"WARNING: Specified kmin (%s) smaller than previous-run kmax = %s. Setting equal to avoid overlapping runs.\n", &char_buf0[convert_uint64_base10_char(char_buf0, kmax)], &char_buf1[convert_uint64_base10_char(char_buf1, kmax_file)]);
 				}
 				kmin = kmax_file;
 				/* We expect any command-line kmax will be > that in the restart file: */
 				if(kmax)
-					ASSERT(HERE, kmax > kmax_file,"kmax >= kmax_file");
+					ASSERT(kmax > kmax_file,"kmax >= kmax_file");
 			}
 
 			/****
 				3) -kplus used to increment an upper bound from a previous factoring run:
 			****/
 			if(kplus) {
-				ASSERT(HERE, (bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)");
+				ASSERT((bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)");
 				kmin = kmax_file;
 				/* Ensure incremented value kmax fits into a 64-bit unsigned int: */
-				ASSERT(HERE, (kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!");
+				ASSERT((kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!");
 				kmax = kmin + kplus;
 				kplus = 0;	/* If kplus != 0 detected further on, that indicates that no valid restart
 							file was found for factoring-bounds incrementing. */
@@ -1410,18 +1410,18 @@ exit(0);
 		if(passmin > (TF_PASSES-1) )
 		{
 			fprintf(stderr,"ERROR: passmin must be <= %u. Offending entry = %u.\n", TF_PASSES-1, passmin);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		if(passmax < passmin)
 		{
 			fprintf(stderr,"ERROR: (passmax = %u) < (passmin = %u)!\n", passmax, passmin);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 		if(passmax > (TF_PASSES-1) )
 		{
 			fprintf(stderr,"ERROR: passmax must be <= %u. Offending entry = %u.\n", TF_PASSES-1, passmax);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		/**** Process factor candidate bounds: ****/
@@ -1433,16 +1433,16 @@ exit(0);
 	#endif
 		/* Compute kmax if not already set: */
 		if(!kmax) {
-			ASSERT(HERE, bmax <= (nbits_in_p+65), "Specified bmax implies kmax > 64-bit, which exceeds the program's limit ... aborting.");
+			ASSERT(bmax <= (nbits_in_p+65), "Specified bmax implies kmax > 64-bit, which exceeds the program's limit ... aborting.");
 			kmax = given_b_get_k(bmax, two_p, lenQ);
-			ASSERT(HERE, kmax > 0, "Something went wrong with the computation of kmax ... possibly your bmax implies kmax > 64-bit?");
+			ASSERT(kmax > 0, "Something went wrong with the computation of kmax ... possibly your bmax implies kmax > 64-bit?");
 		}
 		if(kmin || bmin) {
 			if(kmin == 0ull) {	/* Lower Bound given in log2rithmic form */
-				ASSERT(HERE, bmin <= bmax, "bmin >= bmax!");
+				ASSERT(bmin <= bmax, "bmin >= bmax!");
 				kmin = given_b_get_k(bmin, two_p, lenQ);
 			} else {
-				ASSERT(HERE, kmin <= kmax, "kmin >= kmax!");
+				ASSERT(kmin <= kmax, "kmin >= kmax!");
 			#ifdef P1WORD
 				fqlo = kmin*twop_float + 1.0;
 				bmin = log(fqlo)*ILG2;
@@ -1453,7 +1453,7 @@ exit(0);
 			fqlo = 1.0;
 		#endif
 		}
-ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!");
+ASSERT(0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!");
 //**** Do savefile-init here? ******
 		if(kmax || bmax) {
 			if(kmax == 0ull) {	/* Upper Bound given in log2rithmic form */
@@ -1465,14 +1465,14 @@ ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax,
 			#endif
 			}
 		} else
-			ASSERT(HERE, 0 ,"factor.c : One of bmax, kmax must be nonzero!");
+			ASSERT(0 ,"factor.c : One of bmax, kmax must be nonzero!");
 
 		/**** At this point the paired elements bmin|kmin, bmax|kmax are in synchrony. ****/
 
 		/* If kplus given on command line, a valid restart file should have been found
 		and kmax incremented at this point, i.e. kplus should have been reset to zero:
 		*/
-		ASSERT(HERE, kplus == 0, "kplus must be zero here!");
+		ASSERT(kplus == 0, "kplus must be zero here!");
 
 		know = kmin;
 		passnow = passmin;
@@ -1483,7 +1483,7 @@ ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax,
 /****************** SIEVE STUFF: *********************/
 /*****************************************************/
 
-	ASSERT(HERE, NUM_SIEVING_PRIME > 0, "factor.c : NUM_SIEVING_PRIME > 0");
+	ASSERT(NUM_SIEVING_PRIME > 0, "factor.c : NUM_SIEVING_PRIME > 0");
 
 /*   allocate the arrays and initialize the array of sieving primes	*/
 	temp_late = (uint64 *)calloc(len, sizeof(uint64));
@@ -1498,12 +1498,12 @@ ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax,
 	bit_map2= (uint64 *)calloc(i * NTHREADS, sizeof(uint64));	// 2nd alloc to give each thread 1 bit-clearable copy of master bit_map
 	if (bit_map == NULL) {
 		fprintf(stderr,"Memory allocation failure for BITMAP array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 	bit_atlas = (uint64 *)calloc(i * TF_PASSES, sizeof(uint64));
 	if (bit_atlas == NULL) {
 		fprintf(stderr,"Memory allocation failure for TEMPLATE array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that in bit_atlas]\n",len,i,TF_PASSES);
 
@@ -1511,51 +1511,51 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that
 	psmall = (uint32 *)calloc(NUM_SIEVING_PRIME * NTHREADS, sizeof(uint32));
 	if (psmall == NULL) {
 		fprintf(stderr,"Memory allocation failure for PSMALL array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
   #endif
 
 	pdiff = (uint8 *)calloc(NUM_SIEVING_PRIME * NTHREADS, sizeof(uint8));
 	if (pdiff == NULL) {
 		fprintf(stderr,"Memory allocation failure for pdiff array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 
 	startval = (uint32 *)calloc(NUM_SIEVING_PRIME * NTHREADS, sizeof(uint32));
 	if (startval == NULL) {
 		fprintf(stderr,"Memory allocation failure for STARTVAL array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 
 	pinv = (uint32 *)calloc(NUM_SIEVING_PRIME, sizeof(uint32));
 	if (pinv == NULL) {
 		fprintf(stderr,"Memory allocation failure for PINV array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 
   #if DBG_SIEVE
 	startval_incr = (uint32 *)calloc(NUM_SIEVING_PRIME, sizeof(uint32));
 	if (startval_incr == NULL) {
 		fprintf(stderr,"Memory allocation failure for STARTVAL_INCR array");
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
   #endif
 
 		/* Check integrity (at least in the sense of monotonicity) for the precomputed pseudoprime table: */
 		for(i = 1; i < 9366; ++i) {
-			ASSERT(HERE, f2psp[i] > f2psp[i-1],"Misplaced pseudoprime!");
+			ASSERT(f2psp[i] > f2psp[i-1],"Misplaced pseudoprime!");
 		}
 
 		/* Test some near-2^32 known-prime cases: */
 		curr_p = (uint32)-5;
 		itmp32 = twopmodq32(curr_p-1, curr_p);
-		ASSERT(HERE, itmp32 == 1,"twopmodq32: 2^32 - 5 test fails!");
+		ASSERT(itmp32 == 1,"twopmodq32: 2^32 - 5 test fails!");
 		curr_p = (uint32)-17;
 		itmp32 = twopmodq32(curr_p-1, curr_p);
-		ASSERT(HERE, itmp32 == 1,"twopmodq32: 2^32 -17 test fails!");
+		ASSERT(itmp32 == 1,"twopmodq32: 2^32 -17 test fails!");
 		curr_p = (uint32)-35;	/* Start of the last length-30 curr_p%30 == 11 interval < 2^32; the 6th candidate in that interval, 2^32-17, is prime */
 		itmp32 = twopmodq32_x8(curr_p, curr_p+ 2, curr_p+ 6, curr_p+ 8, curr_p+12, curr_p+18, curr_p+20, curr_p+26);
-		ASSERT(HERE, itmp32 ==32,"twopmodq32_x8: 2^32 -35 test fails!");
+		ASSERT(itmp32 ==32,"twopmodq32_x8: 2^32 -35 test fails!");
 
 		fprintf(stderr,"Generating difference table of first %u small primes\n", nprime);
 		curr_p = 3;	/* Current prime stored in l. */
@@ -1596,7 +1596,7 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that
 			{
 				if((itmp32 >> j)&0x1)	// It's a PRP, so check against the table of known pseudoprimes and
 				{						// (if it's not a PSP) init for the next gap
-					ASSERT(HERE, curr_p <= f2psp[f2psp_idx],"Error in pseudoprime sieve");
+					ASSERT(curr_p <= f2psp[f2psp_idx],"Error in pseudoprime sieve");
 					if((curr_p + pdsum_8[j]) == f2psp[f2psp_idx])	/* It's a base-2 pseudoprime */
 					{
 						++f2psp_idx;
@@ -1645,13 +1645,13 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that
 
   #if 0
 	// Oct 2015: Play with Smarandache numbers ():
-	i = 2000000;	ASSERT(HERE, i <= nprime, "prime limit exceeded in testSmarandache!");
+	i = 2000000;	ASSERT(i <= nprime, "prime limit exceeded in testSmarandache!");
 	testSmarandache(100001,101000, pdiff, i);
 	exit(0);
   #endif
   #if 0
 	// Oct 2018: Play with "sieve survivors" stats: lim(n --> oo) prod_(p <= n)(1-1/p)/(1/ln(p^2))
-	i = 1000000000;	ASSERT(HERE, i <= MAX_SIEVING_PRIME, "prime limit exceeded in testSieveProdAsymp!");
+	i = 1000000000;	ASSERT(i <= MAX_SIEVING_PRIME, "prime limit exceeded in testSieveProdAsymp!");
 	struct qfloat qfprod = QHALF, qt;
 	double prod = 0.5, log_psq = log((double)i*i);
 	for(m = 0, curr_p = 3; m < nprime; m++) {
@@ -1766,7 +1766,7 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that
 		if(p > curr_p) break;
 		curr_p -= (pdiff[nprime--] << 1);
 	#ifdef FAC_DEBUG
-		ASSERT(HERE, curr_p == prime[nprime], "factor.c : curr_p == prime[nprime]");
+		ASSERT(curr_p == prime[nprime], "factor.c : curr_p == prime[nprime]");
 	#endif
 	}
 	MAX_SIEVING_PRIME = curr_p;
@@ -1792,20 +1792,20 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that
 	#else	// 4620 classes:
 		pass_targ = CHECK_PKMOD4620(p,lenP, k_targ, 0x0) - 1;
 	#endif
-		ASSERT(HERE, (pass_targ < TF_PASSES), "Candidate factor set via k_targ is not a possible factor for this exponent!");
+		ASSERT((pass_targ < TF_PASSES), "Candidate factor set via k_targ is not a possible factor for this exponent!");
 		printf("Target pass for debug-factor = %u\n",pass_targ);
 	}
 
   #endif
 
 	itmp64 = (uint64)mi64_div_y32(p,TF_CLASSES,0x0,lenP);
-//	printf("p %% 60 = %llu\n",itmp64);
+//	printf("p %% 60 = %" PRIu64 "\n",itmp64);
 
   #if TF_CLASSES == 60
 /*
 	const int pmod_vec[] = { 1, 7,11,13,17,19,23,29,31,37,41,43,47,49,53,59, 2,4,8,16,32, 0x0};
 	for(i = 0; pmod_vec[i] != 0; i++) {
-		ASSERT(HERE, CHECK_PKMOD60(pmod_vec[i], k, incr) == 16, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES = 16!\n");
+		ASSERT(CHECK_PKMOD60(pmod_vec[i], k, incr) == 16, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES = 16!\n");
 	}
 	exit(0);
 Mersenne Mp: Acceptable km-values for the 16 possible pm (= p%60) values:
@@ -1833,7 +1833,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
 	pm = 32: 0, 4,10,12,18,22,24,28,30,34,40,42,48,52,54,58
 */
 	i = CHECK_PKMOD60  (&itmp64,1, k, incr);
-	ASSERT(HERE, i == TF_PASSES, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[1,7,11,13,17,19,23,29,31,37,41,43,47,49,53,59] (mod 60).\n");
+	ASSERT(i == TF_PASSES, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[1,7,11,13,17,19,23,29,31,37,41,43,47,49,53,59] (mod 60).\n");
 /*
 	printf("k mod 60 = [");
 	for(i = 0, j = 0; i < 16; i++) {
@@ -1845,7 +1845,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
 */
   #else	// 4620 classes:
 	i = CHECK_PKMOD4620(&itmp64,1, k, incr);
-	ASSERT(HERE, i == TF_PASSES, "CHECK_PKMOD4620 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[960 possible values] (mod 4620).\n");
+	ASSERT(i == TF_PASSES, "CHECK_PKMOD4620 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[960 possible values] (mod 4620).\n");
   #endif
 
 	/* If it's a restart, interval_lo for the initial pass will be based
@@ -1893,7 +1893,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
   #ifdef FAC_DEBUG
 	/* Make sure the range of k's for the run contains any target factor: */
 	if(k_targ)
-		ASSERT(HERE, (kmin <= k_targ) && (kmax >= k_targ),"k_targ not in [kmin, kmax]");
+		ASSERT((kmin <= k_targ) && (kmax >= k_targ),"k_targ not in [kmin, kmax]");
   #endif
 
   #ifdef FACTOR_STANDALONE
@@ -2018,7 +2018,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
 		}
 		/* Should never reach this regular-loop-exit point: */
 		fprintf(stderr,"ERROR: failed to find a multiple of prime %u\n", curr_p);
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 
 	KLOOP:
 		/* Propagate copies of length (regs_todo) bit-cleared portion of sieve to remaining parts of sieve.
@@ -2054,7 +2054,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
   #ifdef FACTOR_STANDALONE
 	 printf(   "TRYQ = %u, max sieving prime = %u\n",TRYQ,MAX_SIEVING_PRIME);
   #else
-	ASSERT(HERE, fp == 0x0,"0");
+	ASSERT(fp == 0x0,"0");
 	fp = mlucas_fopen(STATFILE,"a");
 	fprintf(fp,"TRYQ = %u, max sieving prime = %u\n",TRYQ,MAX_SIEVING_PRIME);
 	fclose(fp); fp = 0x0;
@@ -2121,11 +2121,11 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
 	}	/* end of K loop	*/
 //printf("L3: template word %u [used %u copies] bit_atlas chart %u, word %u, bit %u\n",(uint32)k,ncopies,l,word,bit);	exit(0);
 	// For 60|4620 classes expect to end at bit 15|63 of the last word of each of the TF_PASSES = 16|960 sievelets (a.k.a. charts in our atlas):
-	ASSERT(HERE, (k == 0) && (l == 0), "bit_atlas init: Exit check 1 failed!");
+	ASSERT((k == 0) && (l == 0), "bit_atlas init: Exit check 1 failed!");
   #if TF_CLASSES == 60
-	ASSERT(HERE, (word == 4254) && (bit == 15), "bit_atlas init: Exit check 2 failed!");
+	ASSERT((word == 4254) && (bit == 15), "bit_atlas init: Exit check 2 failed!");
   #else	// 4620 classes:
-	ASSERT(HERE, (word == 3535) && (bit == 63), "bit_atlas init: Exit check 2 failed!");
+	ASSERT((word == 3535) && (bit == 63), "bit_atlas init: Exit check 2 failed!");
   #endif
 
   #ifdef FAC_DEBUG
@@ -2157,27 +2157,27 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values:
 	switch(pmodNC)
 	{
 		/*   p mod 12 = 1:	*/
-		case  1:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case  1"); break;	/* k mod 5 .ne. 2	*/
-		case 37:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==48&&incr[i++]==59&&incr[i++]==60, "factor.c : case 37"); break;	/* k mod 5 .ne. 1	*/
-		case 13:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==47&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==60, "factor.c : case 13"); break;	/* k mod 5 .ne. 4	*/
-		case 49:ASSERT(HERE, incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case 49"); break;	/* k mod 5 .ne. 3	*/
+		case  1:ASSERT(incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case  1"); break;	/* k mod 5 .ne. 2	*/
+		case 37:ASSERT(incr[i++]== 3&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==48&&incr[i++]==59&&incr[i++]==60, "factor.c : case 37"); break;	/* k mod 5 .ne. 1	*/
+		case 13:ASSERT(incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==47&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==60, "factor.c : case 13"); break;	/* k mod 5 .ne. 4	*/
+		case 49:ASSERT(incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case 49"); break;	/* k mod 5 .ne. 3	*/
 		/*   p mod 12 == 7:	*/
-		case 31:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==60, "factor.c : case 31"); break;	/* k mod 5 .ne. 2	*/
-		case  7:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==33&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==57&&incr[i++]==60, "factor.c : case  7"); break;	/* k mod 5 .ne. 1	*/
-		case 43:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==32&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 43"); break;	/* k mod 5 .ne. 4	*/
-		case 19:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 19"); break;	/* k mod 5 .ne. 3	*/
+		case 31:ASSERT(incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==60, "factor.c : case 31"); break;	/* k mod 5 .ne. 2	*/
+		case  7:ASSERT(incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==33&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==57&&incr[i++]==60, "factor.c : case  7"); break;	/* k mod 5 .ne. 1	*/
+		case 43:ASSERT(incr[i++]== 5&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==32&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 43"); break;	/* k mod 5 .ne. 4	*/
+		case 19:ASSERT(incr[i++]== 5&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 19"); break;	/* k mod 5 .ne. 3	*/
 		/*   p mod 12 == 5:	*/
-		case 41:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 4&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==55&&incr[i++]==60, "factor.c : case 41"); break;	/* k mod 5 .ne. 2	*/
-		case 17:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==28&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 17"); break;	/* k mod 5 .ne. 1	*/
-		case 53:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==27&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 53"); break;	/* k mod 5 .ne. 4	*/
-		case 29:ASSERT(HERE, incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 29"); break;	/* k mod 5 .ne. 3	*/
+		case 41:ASSERT(incr[i++]== 3&&incr[i++]== 4&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==55&&incr[i++]==60, "factor.c : case 41"); break;	/* k mod 5 .ne. 2	*/
+		case 17:ASSERT(incr[i++]== 3&&incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==28&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 17"); break;	/* k mod 5 .ne. 1	*/
+		case 53:ASSERT(incr[i++]== 3&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==27&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 53"); break;	/* k mod 5 .ne. 4	*/
+		case 29:ASSERT(incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 29"); break;	/* k mod 5 .ne. 3	*/
 		/*   p mod 12 == 11:	*/
-		case 11:ASSERT(HERE, incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==60, "factor.c : case 11"); break;	/* k mod 5 .ne. 2	*/
-		case 47:ASSERT(HERE, incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==13&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 47"); break;	/* k mod 5 .ne. 1	*/
-		case 23:ASSERT(HERE, incr[i++]== 1&&incr[i++]==12&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 23"); break;	/* k mod 5 .ne. 4	*/
-		case 59:ASSERT(HERE, incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 59"); break;	/* k mod 5 .ne. 3	*/
+		case 11:ASSERT(incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==60, "factor.c : case 11"); break;	/* k mod 5 .ne. 2	*/
+		case 47:ASSERT(incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==13&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 47"); break;	/* k mod 5 .ne. 1	*/
+		case 23:ASSERT(incr[i++]== 1&&incr[i++]==12&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 23"); break;	/* k mod 5 .ne. 4	*/
+		case 59:ASSERT(incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 59"); break;	/* k mod 5 .ne. 3	*/
 		default:
-			ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT,"Only Mersenne and fermat-number factoring supported!");
+			ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT,"Only Mersenne and fermat-number factoring supported!");
 	}
   #endif
 
@@ -2266,10 +2266,10 @@ candidate factors that survive sieving.	*/
 				/* Starting no.-of-times-through-sieve = kmin/(64*len) : */
 				if(pass == passnow && (know > kmin)) {
 					interval_lo = know/((uint64)len << TF_CLSHIFT);
-					ASSERT(HERE, know == interval_lo *(len << TF_CLSHIFT),"know == interval_lo*(len << TF_CLSHIFT)");
+					ASSERT(know == interval_lo *(len << TF_CLSHIFT),"know == interval_lo*(len << TF_CLSHIFT)");
 				} else {
 					interval_lo = kmin/((uint64)len << TF_CLSHIFT);
-					ASSERT(HERE, kmin == interval_lo *(len << TF_CLSHIFT),"kmin == interval_lo*(len << TF_CLSHIFT)");
+					ASSERT(kmin == interval_lo *(len << TF_CLSHIFT),"kmin == interval_lo*(len << TF_CLSHIFT)");
 				}
 			} else {
 				interval_lo = interval_hi;	// This is what defines a 'no-op' pool task.
@@ -2277,9 +2277,9 @@ candidate factors that survive sieving.	*/
 			/* Set initial k for this pass to default value (= incr[pass]) + interval_lo*(64*len),
 			(assume this could be as large as 64 bits), then use it to set initial q for this pass:
 			*/
-			ASSERT(HERE, (double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT");
+			ASSERT((double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT");
 			k = (uint64)incr[pass] + interval_lo*(len << TF_CLSHIFT);
-		//	fprintf(stderr," [*** Init pass %u data: k0 = %llu, word0 = %16llX\n",pass,k,bit_map[0]);
+		//	fprintf(stderr," [*** Init pass %u data: k0 = %" PRIu64 ", word0 = %16" PRIX64 "\n",pass,k,bit_map[0]);
 			struct fac_thread_data_t* targ = tdat + thr_id;
 			targ->count = &count;
 			targ->tid = thr_id;		// Within the per-thread TFing, only the pool-thread ID matters
@@ -2344,14 +2344,14 @@ candidate factors that survive sieving.	*/
 		#if 0
 			printf("adding pool task %d with pool ID [%d]\n",thr_id,((struct thread_init *)(&task_control)->data)->thread_num);
 			struct fac_thread_data_t* targ = tdat + thr_id;
-			printf("This task has: pass %u, interval_[lo,hi] = [%llu,%llu]\n",targ->pass,targ->interval_lo,targ->interval_hi);
+			printf("This task has: pass %u, interval_[lo,hi] = [%" PRIu64 ",%" PRIu64 "]\n",targ->pass,targ->interval_lo,targ->interval_hi);
 			printf("; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 		#endif
 		}
 
 		while(tpool->free_tasks_queue.num_tasks != NTHREADS) {
 			// Posix sleep() too granular here; use finer-resolution, declared in <time.h>; cf. http://linux.die.net/man/2/nanosleep
-			ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+			ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 		}
 		fprintf(stderr,"\n");	// For pretty-printing, have the inline-pass-printing reflect || work, newlines reflect sync-points
 	};	// wave-loop
@@ -2374,7 +2374,7 @@ candidate factors that survive sieving.	*/
 		/* If debugging sieve, make sure critical bit hasn't been cleared: */
 		if( k_targ && (((bit_map[i64_targ] >> bit_targ) & 1) == 0) ) {
 			fprintf(stderr,"Critical bit cleared in master bitmap!\n");
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	#endif
 
@@ -2383,7 +2383,7 @@ candidate factors that survive sieving.	*/
 			printf("pass = %u",pass);	fflush(stdout);
 		}
 	#else
-		ASSERT(HERE, fp == 0x0,"0");
+		ASSERT(fp == 0x0,"0");
 		fp = mlucas_fopen(STATFILE,"a");
 		fprintf(fp,"Starting Trial-factoring Pass %2u...\n",pass);
 		fclose(fp); fp = 0x0;
@@ -2392,16 +2392,16 @@ candidate factors that survive sieving.	*/
 		/* Starting no.-of-times-through-sieve = kmin/(64*len) : */
 		if(pass == passnow && (know > kmin)) {
 			interval_lo = know/((uint64)len << TF_CLSHIFT);
-			ASSERT(HERE, know == interval_lo*((uint64)len << TF_CLSHIFT),"know == interval_lo*((uint64)len << TF_CLSHIFT)");
+			ASSERT(know == interval_lo*((uint64)len << TF_CLSHIFT),"know == interval_lo*((uint64)len << TF_CLSHIFT)");
 		} else {
 			interval_lo = kmin/((uint64)len << TF_CLSHIFT);
-			ASSERT(HERE, kmin == interval_lo*((uint64)len << TF_CLSHIFT),"kmin == interval_lo*((uint64)len << TF_CLSHIFT)");
+			ASSERT(kmin == interval_lo*((uint64)len << TF_CLSHIFT),"kmin == interval_lo*((uint64)len << TF_CLSHIFT)");
 		}
 
 		/* Set initial k for this pass to default value (= incr[pass]) + interval_lo*(64*len),
 		(assume this could be as large as 64 bits), then use it to set initial q for this pass:
 		*/
-		ASSERT(HERE, (double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT");
+		ASSERT((double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT");
 		k = (uint64)incr[pass] + interval_lo*(len << TF_CLSHIFT);
 
 		i = nprime;	// Remember, MAX_SIEVING_PRIME is a *variable* and set at runtime, as opposed to the predef NUM_SIEVING_PRIME;
@@ -2439,7 +2439,7 @@ candidate factors that survive sieving.	*/
 		if(cudaError != cudaSuccess)
 		{
 			printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError));
-			ASSERT(HERE, 0, "factor.c : GPU-side error detected!");
+			ASSERT(0, "factor.c : GPU-side error detected!");
 		}
 	#endif
 
@@ -2465,14 +2465,14 @@ candidate factors that survive sieving.	*/
   #ifdef FACTOR_STANDALONE
 	if(!restart)
 	{
-		printf(   "%s(%s) has %u factors in range k = [%llu, %llu], passes %u-%u\n",
+		printf(   "%s(%s) has %u factors in range k = [%" PRIu64 ", %" PRIu64 "], passes %u-%u\n",
 	 	NUM_PREFIX[MODULUS_TYPE], pstring, nfactor, kmin, kmax, passmin, passmax);
 		printf(   "Performed %s trial divides\n", &char_buf0[convert_uint64_base10_char(char_buf0, count)]);
 		/* Since we're done accumulating cycle count, divide to get total time in seconds: */
 		printf(   "Clocks =%s\n",get_time_str(tdiff));
 	}
   #else
-	ASSERT(HERE, fp == 0x0,"0");
+	ASSERT(fp == 0x0,"0");
 	fp = mlucas_fopen(STATFILE,"a");
 	fprintf(fp,"Performed %s trial divides\n", &char_buf0[convert_uint64_base10_char(char_buf0, count)]);
 	/* Since we're done accumulating cycle count, divide to get total time in seconds: */
@@ -2482,9 +2482,9 @@ candidate factors that survive sieving.	*/
 
 	fp = mlucas_fopen(   OFILE,"a");
   #ifdef P1WORD
-	 fprintf(fp,"M(%s) has %u factors in range k = [%llu, %llu], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax);
+	 fprintf(fp,"M(%s) has %u factors in range k = [%" PRIu64 ", %" PRIu64 "], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax);
   #else
-	 fprintf(fp,"M(%s) has %u factors in range k = [%llu, %llu], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax);
+	 fprintf(fp,"M(%s) has %u factors in range k = [%" PRIu64 ", %" PRIu64 "], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax);
   #endif
 	fclose(fp); fp = 0x0;
 
@@ -2492,7 +2492,7 @@ candidate factors that survive sieving.	*/
 	/* If a test factor was given, make sure we found at least one factor: */
 	if(k_targ > 0)
 	{
-		ASSERT(HERE, nfactor > 0,"k_targ > 0 but failed to find at least one factor");
+		ASSERT(nfactor > 0,"k_targ > 0 but failed to find at least one factor");
 	}
   #endif
 
@@ -2566,7 +2566,7 @@ candidate factors that survive sieving.	*/
   #endif
 	/* If we reached here other than via explicit invocation of the help menu, assert: */
 	if(!STREQ(stFlag, "-h"))
-		ASSERT(HERE, 0,"Mfactor: Unrecognized command-line option!");
+		ASSERT(0,"Mfactor: Unrecognized command-line option!");
 	return(0);
   #endif
 }
@@ -2686,7 +2686,7 @@ candidate factors that survive sieving.	*/
 	#ifdef P4WORD
 		uint256 p256,q256,t256;
 	#endif
-		char cbuf[STR_MAX_LEN], cbuf2[STR_MAX_LEN];
+		char cbuf[STR_MAX_LEN*2], cbuf2[STR_MAX_LEN*2];
 	#ifdef CTIME
 		clock_t clock1, clock2;
 	#else	// Multithreaded needs wall-clock, not CPU time:
@@ -2709,27 +2709,27 @@ candidate factors that survive sieving.	*/
 			itmp = fscanf(fp,"%s\n",cstr);
 			if(itmp <= 0 || !STREQ(cstr,pstring)) {
 				sprintf(char_buf0,"Line 1 entry found in factoring savefile [%s] does not match exponent of run [%s].",cstr,pstring);
-				ASSERT(HERE,0,char_buf0);
+				ASSERT(0,char_buf0);
 			}
 			itmp = fscanf(fp,"%u\n",&i  );
 			if(itmp <= 0 || i != TF_PASSES      ) {
 				sprintf(char_buf0,"Line 1 entry found in factoring savefile [%d] does not match exponent of run [%d].",i,TF_PASSES);
-				ASSERT(HERE,0,char_buf0);
+				ASSERT(0,char_buf0);
 			}
 			// See if restart file has a pass/max-k-reached entry matching the current pass:
 			while(fgets(cstr,STR_MAX_LEN,fp)) {
 				if((char_addr = strstr(cstr,"Pass ")) != 0) {
 					itmp = sscanf(char_addr,"%u",i);
 					if(itmp <= 0) {
-						fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(HERE, 0,"0");
+						fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(0,"0");
 					}
 					if(i == pass) {	// Is the pass index the one we are updating? If yes, update the k-value
-						ASSERT(HERE, !found_pass, "Multiple current-pass entry found in savefile!");
+						ASSERT(!found_pass, "Multiple current-pass entry found in savefile!");
 						found_pass = TRUE;
 						// Read the max-k-reached value
-						ASSERT(HERE,((char_addr = strstr(cstr,"Pass ")) != 0),"Expected : following pass number not found!");
-						itmp = sscanf(char_addr,"%llu",k);
-						ASSERT(HERE,itmp >= 0,"Unable to read max-k-reached value!");
+						ASSERT(((char_addr = strstr(cstr,"Pass ")) != 0),"Expected : following pass number not found!");
+						itmp = sscanf(char_addr,"%" SCNu64,k);	// scanf family takes SCN* (not PRI*) macros; NOTE(review): arg must be a uint64* - confirm k is a pointer here
+						ASSERT(itmp >= 0,"Unable to read max-k-reached value!");
 						// Even if valid entry found, process rest of file to ensure no duplicate-pass-number entries
 					}
 				}
@@ -2737,7 +2737,7 @@ candidate factors that survive sieving.	*/
 			/* pstring*/
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (current exponent) of factoring restart file %s!\n", curr_line, RESTARTFILE);		ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (current exponent) of factoring restart file %s!\n", curr_line, RESTARTFILE);		ASSERT(0,"0");
 			}
 			/* Strip the expected newline char from in_line: */
 			char_addr = strstr(in_line, "\n");
@@ -2745,23 +2745,23 @@ candidate factors that survive sieving.	*/
 				*char_addr = '\0';
 			/* Make sure restart-file and current-run pstring match: */
 			if(STRNEQ(in_line, pstring)) {
-				fprintf(stderr,"ERROR: current exponent %s != Line %d of factoring restart file %s!\n",pstring, curr_line, RESTARTFILE);		ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: current exponent %s != Line %d of factoring restart file %s!\n",pstring, curr_line, RESTARTFILE);		ASSERT(0,"0");
 			}
 
 			/* bmin */
 			++curr_line;
-			fgets(cbuf, STR_MAX_LEN, fp);
+			fgets(cbuf, STR_MAX_LEN*2, fp);
 			itmp = sscanf(cbuf, "%lf", &bmin_file);
 			if(itmp != 1) {
-				fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf);		ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf);		ASSERT(0,"0");
 			}
 
 			/* bmax */
 			++curr_line;
-			fgets(cbuf, STR_MAX_LEN, fp);
+			fgets(cbuf, STR_MAX_LEN*2, fp);
 			itmp = sscanf(cbuf, "%lf", &bmax_file);
 			if(itmp != 1) {
-				fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf);		ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to parse Line %d (bmax) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf);		ASSERT(0,"0");
 			}
 
 		/************************************
@@ -2776,7 +2776,7 @@ candidate factors that survive sieving.	*/
 	GET_LINE4:
 		/**** redo this ****/
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: 'KMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: 'KMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "KMin");
 			/* Since the preceding fscanf call may leave us at the end of curr_line-1
@@ -2787,7 +2787,7 @@ candidate factors that survive sieving.	*/
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				kmin_file = convert_base10_char_uint64(char_addr);
@@ -2796,15 +2796,15 @@ candidate factors that survive sieving.	*/
 			/* KNow */
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (KNow) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (KNow) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "KNow");
 			if(!char_addr) {
-				fprintf(stderr,"ERROR: 'KNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: 'KNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				know_file = convert_base10_char_uint64(char_addr);
@@ -2813,15 +2813,15 @@ candidate factors that survive sieving.	*/
 			/* KMax */
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (KMax) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (KMax) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "KMax");
 			if(!char_addr) {
-				fprintf(stderr,"ERROR: 'KMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: 'KMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				kmax_file = convert_base10_char_uint64(char_addr);
@@ -2830,71 +2830,71 @@ candidate factors that survive sieving.	*/
 			/* PassMin */
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (PassMin) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (PassMin) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "PassMin");
 			if(!char_addr) {
-				fprintf(stderr,"ERROR: 'PassMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: 'PassMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				passmin_file = (uint32)convert_base10_char_uint64(char_addr);
-				ASSERT(HERE, passmin_file < TF_PASSES,"factor.c: passmin < TF_PASSES");
+				ASSERT(passmin_file < TF_PASSES,"factor.c: passmin < TF_PASSES");
 			}
 
 			/* PassNow */
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (PassNow) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (PassNow) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "PassNow");
 			if(!char_addr) {
-				fprintf(stderr,"ERROR: 'PassNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: 'PassNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				passnow_file = (uint32)convert_base10_char_uint64(char_addr);
-				ASSERT(HERE, passnow_file < TF_PASSES,"factor.c: passnow < TF_PASSES");
-				ASSERT(HERE, passnow_file >= passmin_file  ,"factor.c: passnow_file >= passmin_file");
+				ASSERT(passnow_file < TF_PASSES,"factor.c: passnow < TF_PASSES");
+				ASSERT(passnow_file >= passmin_file  ,"factor.c: passnow_file >= passmin_file");
 			}
 
 			/* PassMax */
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (PassMax) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (PassMax) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "PassMax");
 			if(!char_addr) {
-				fprintf(stderr,"ERROR: 'PassMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: 'PassMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				passmax_file = (uint32)convert_base10_char_uint64(char_addr);
-				ASSERT(HERE, passmax_file < TF_PASSES,"factor.c: passmax_file < TF_PASSES");
-				ASSERT(HERE, passmax_file >= passnow_file  ,"factor.c: passmax_file >= passnow_file");
+				ASSERT(passmax_file < TF_PASSES,"factor.c: passmax_file < TF_PASSES");
+				ASSERT(passmax_file >= passnow_file  ,"factor.c: passmax_file >= passnow_file");
 			}
 
 			/* Number of q's tried: */
 			++curr_line;
 			if(!fgets(in_line, STR_MAX_LEN, fp)) {
-				fprintf(stderr,"ERROR: unable to read Line %d (#Q tried) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: unable to read Line %d (#Q tried) of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			}
 			char_addr = strstr(in_line, "#Q tried");
 			if(!char_addr) {
-				fprintf(stderr,"ERROR: '#Q tried' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: '#Q tried' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 			} else {
 				char_addr = strstr(in_line, "=");
 				if(!char_addr) {
-					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE);	ASSERT(0,"0");
 				}
 				char_addr++;
 				count = convert_base10_char_uint64(char_addr);	// Need to reset == 0 prior to sieving so kvector-fill code works properly
@@ -2939,12 +2939,12 @@ candidate factors that survive sieving.	*/
 			if(bmin || bmax)
 			{
 			#if(!defined(P1WORD))
-			//	ASSERT(HERE, 0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!");
+			//	ASSERT(0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!");
 			#endif
-				ASSERT(HERE, (kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run.");
+				ASSERT((kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run.");
 
 				if(bmin) {
-					ASSERT(HERE, bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file");
+					ASSERT(bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file");
 					if(bmin < bmax_file)
 						fprintf(stderr,"WARNING: Specified bmin (%lf) smaller than previous-run bmax = %lf. Setting equal to avoid overlapping runs.\n", bmin, bmax_file);
 				}
@@ -2952,7 +2952,7 @@ candidate factors that survive sieving.	*/
 
 				/* We expect any command-line bmax will be > that in the restart file: */
 				if(bmax)
-					ASSERT(HERE, bmax > bmax_file - 0.0000000001,"bmax >= bmax_file");
+					ASSERT(bmax > bmax_file - 0.0000000001,"bmax >= bmax_file");
 			}
 
 			/****
@@ -2963,10 +2963,10 @@ candidate factors that survive sieving.	*/
 			****/
 			if(kmin || kmax)
 			{
-				ASSERT(HERE, (bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)");
+				ASSERT((bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)");
 
 				if(kmin) {
-					ASSERT(HERE, kmin >= kmin_file,"kmin >= kmin_file");
+					ASSERT(kmin >= kmin_file,"kmin >= kmin_file");
 					if(kmin < kmax_file)
 						fprintf(stderr,"WARNING: Specified kmin (%s) smaller than previous-run kmax = %s. Setting equal to avoid overlapping runs.\n", &char_buf0[convert_uint64_base10_char(char_buf0, kmax)], &char_buf1[convert_uint64_base10_char(char_buf1, kmax_file)]);
 				}
@@ -2974,7 +2974,7 @@ candidate factors that survive sieving.	*/
 
 				/* We expect any command-line kmax will be > that in the restart file: */
 				if(kmax)
-					ASSERT(HERE, kmax > kmax_file,"kmax >= kmax_file");
+					ASSERT(kmax > kmax_file,"kmax >= kmax_file");
 			}
 
 			/****
@@ -2982,11 +2982,11 @@ candidate factors that survive sieving.	*/
 			****/
 			if(kplus)
 			{
-				ASSERT(HERE, (bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)");
+				ASSERT((bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)");
 
 				kmin = kmax_file;
 				/* Ensure incremented value kmax fits into a 64-bit unsigned int: */
-				ASSERT(HERE, (kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!");
+				ASSERT((kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!");
 				kmax = kmin + kplus;
 				kplus = 0;	/* If kplus != 0 detected further on, that indicates that no valid restart
 							file was found for factoring-bounds incrementing. */
@@ -3016,12 +3016,12 @@ candidate factors that survive sieving.	*/
 
 	  #ifdef FAC_DEBUG
 		// compute qstart = 2.kstart.p + 1:
-		ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
+		ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
 		q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 		printf(" Initial q for this pass = %s.\n", &char_buf0[convert_mi64_base10_char(char_buf0, q, lenQ, 0)]);
 	  #endif
 //if(pass==4)
-//	printf("\nPass %u: k0 = %u, word0 prior to deep-prime clearing = %16llX\n",pass,(uint32)kstart,bit_map[0]);
+//	printf("\nPass %u: k0 = %u, word0 prior to deep-prime clearing = %16" PRIX64 "\n",pass,(uint32)kstart,bit_map[0]);
 		// Compute startbit k (occurrence of first multiple of prime curr_p in first pass through the relevant sievelet:
 		if((lenP == 1) && (p[0] <= MAX_SIEVING_PRIME))
 			get_startval(MODULUS_TYPE, p[0], findex, two_p, lenQ, bit_len, interval_lo, incr, nclear, nprime, p_last_small, pdiff, startval);
@@ -3033,7 +3033,7 @@ candidate factors that survive sieving.	*/
 #ifdef MULTITHREAD
 //if(tid == 0)
 #endif
-//	printf("sweep %llu: k0 = %llu, count %llu: k0-3 = %llu,%llu,%llu,%llu\n",sweep,kstart,count,k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]);
+//	printf("sweep %" PRIu64 ": k0 = %" PRIu64 ", count %" PRIu64 ": k0-3 = %" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",sweep,kstart,count,k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]);
 
 			/* Accumulate the cycle count every so often to avoid problems with integer overflow
 			of the clock() result, if clock_t happens to be a 32-bit int type on the host platform:
@@ -3068,7 +3068,7 @@ candidate factors that survive sieving.	*/
 			fbits_in_k = log((double)k + TF_CLASSES*bit_len)*ILG2;	// Use k-value at end of upcoming pass thru sieve as upper-bound
 			fbits_in_q = fbits_in_2p + fbits_in_k;
 	//	if(fbits_in_q > 64)
-	//	printf("sweep = %llu: fbits_in_q = fbits_in_2p [%10.4f] + fbits_in_k [%10.4f] = %10.4f\n",sweep,fbits_in_2p,fbits_in_k,fbits_in_q);
+	//	printf("sweep = %" PRIu64 ": fbits_in_q = fbits_in_2p [%10.4f] + fbits_in_k [%10.4f] = %10.4f\n",sweep,fbits_in_2p,fbits_in_k,fbits_in_q);
 
 		/*********************************************/
 		#if DBG_SIEVE
@@ -3083,7 +3083,7 @@ candidate factors that survive sieving.	*/
 				if((k <= k_targ) && (k_targ < (k+(sieve_len<<6))))
 				{
 					itmp64 = k_targ - k;
-					ASSERT(HERE, itmp64%TF_CLASSES == 0,"(k_targ - k)%TF_CLASSES == 0");
+					ASSERT(itmp64%TF_CLASSES == 0,"(k_targ - k)%TF_CLASSES == 0");
 					itmp64 /= TF_CLASSES;
 					i64_targ = itmp64 >> 6;
 					bit_targ = itmp64 & 63;
@@ -3139,7 +3139,7 @@ candidate factors that survive sieving.	*/
 				curr_p += (pdiff[m] << 1);
 				if(curr_p > bit_len && !((nprime - m)&63)) {	// 2nd clause is to make Loop #2 count a multiple of 64
 					curr_p -= (pdiff[m] << 1);
-					ASSERT(HERE, curr_p < p[0],"On Loop 1 exit: curr_p >= p!");
+					ASSERT(curr_p < p[0],"On Loop 1 exit: curr_p >= p!");
 					break;
 				}
 				l = startval[m];
@@ -3225,15 +3225,15 @@ candidate factors that survive sieving.	*/
 					#endif
 					#if DBG_SIEVE
 						if(k_targ && l == (i64_targ*64 + bit_targ)) {
-							fprintf(stderr,"Critical bit being cleared by prime %u, with offset %u\n", curr_p, startval[m]);	ASSERT(HERE, 0,"0");
+							fprintf(stderr,"Critical bit being cleared by prime %u, with offset %u\n", curr_p, startval[m]);	ASSERT(0,"0");
 						}
 					#endif
 						l += curr_p;
 					}
 					/*...save new startvalue:	*/
 				#if DBG_SIEVE
-					ASSERT(HERE, (startval[m] + startval_incr[m]) < (curr_p + curr_p), "factor.c : (startval[m] + startval_incr[m]) < (curr_p + curr_p)");
-					ASSERT(HERE, l-bit_len == (startval[m] + startval_incr[m])%curr_p, "factor.c : l-bit_len == (startval[m] + startval_incr[m])%curr_p");
+					ASSERT((startval[m] + startval_incr[m]) < (curr_p + curr_p), "factor.c : (startval[m] + startval_incr[m]) < (curr_p + curr_p)");
+					ASSERT(l-bit_len == (startval[m] + startval_incr[m])%curr_p, "factor.c : l-bit_len == (startval[m] + startval_incr[m])%curr_p");
 				#endif
 					startval[m] = l-bit_len;
 				}
@@ -3241,7 +3241,7 @@ candidate factors that survive sieving.	*/
 
 		#endif	// USE_AVX512 ?
 
-//	if(pass==4)printf("\nPass %u: word0 after deep-prime clearing = %16llX\n",pass,bit_map2[0]);
+//	if(pass==4)printf("\nPass %u: word0 after deep-prime clearing = %16" PRIX64 "\n",pass,bit_map2[0]);
 
 			// Now run through the bits of the current copy of the sieve, trial dividing if a bit = 1:
 		  #if TF_CLASSES == 60
@@ -3249,7 +3249,7 @@ candidate factors that survive sieving.	*/
 		  #else
 			ihi = (sieve_len*64)/TF_CLASSES;	// 64*sieve_len divisible by TF_CLASSES, no need for padding
 		  #endif
-			ASSERT(HERE, ihi == ((bit_len+63)>>6), "Ihi value-check failed!");
+			ASSERT(ihi == ((bit_len+63)>>6), "Ihi value-check failed!");
 		#ifdef FAC_DEBUG
 			m = 0;	// accum popc
 			for(i = 0; i < ihi; i++) {
@@ -3262,7 +3262,7 @@ candidate factors that survive sieving.	*/
 		  #ifdef MULTITHREAD
 			if(tid == 0)
 		  #endif
-			printf("%u [%6.2f%%] survived; count = %llu\n",m,100.*(float)m/bit_len,count);
+			printf("%u [%6.2f%%] survived; count = %" PRIu64 "\n",m,100.*(float)m/bit_len,count);
 		#endif
 
 			bit_hi = 64;
@@ -3276,13 +3276,13 @@ candidate factors that survive sieving.	*/
 				{
 				#ifdef FAC_DEBUG
 					/* If a known factor is specified, here it is in the bitmap: */
-					if(ABS((int64)(k-k_targ)) < 1000) printf("Trying k = %llu\n",k);
+					if(ABS((int64)(k-k_targ)) < 1000) printf("Trying k = %" PRIu64 "\n",k);
 					if(k == k_targ) {
 						printf("here it is: sweep = %s, bitmap word = %u, bit = %3u\n", &cbuf[convert_uint64_base10_char(cbuf, sweep)], i, bit);
 						if((bit_map2[i] >> bit) & 1)
-							printf("Trying k_targ = %llu...\n", k_targ);
+							printf("Trying k_targ = %" PRIu64 "...\n", k_targ);
 						else
-							ASSERT(HERE, 0,"0");
+							ASSERT(0,"0");
 					}
 				#endif
 
@@ -3301,16 +3301,16 @@ candidate factors that survive sieving.	*/
 						*/
 						if((count & countmask) == 0)
 						{
-							fprintf(stderr,"[k = %llu]",k);
+							fprintf(stderr,"[k = %" PRIu64 "]",k);
 						#ifdef MULTITHREAD
 							pthread_mutex_lock(&mutex_mi64);
 						//	printf("Count = %u * 2^%u checkpoint: Thread %u locked mutex_mi64 ... ",(uint32)(count >> CMASKBITS),CMASKBITS,tid);
 						#endif
 							fp = mlucas_fopen(OFILE,"a");
-							ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
+							ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
 							q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 						#ifdef FAC_DEBUG
-							sprintf(cbuf, " Count = %u * 2^%u: k = %llu, Current q = %s\n",
+							sprintf(cbuf, " Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s\n",
 								(uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]);
 							fprintf(stderr, "%s", cbuf);
 						#endif
@@ -3323,9 +3323,9 @@ candidate factors that survive sieving.	*/
 							if(MODULUS_TYPE == MODULUS_TYPE_MERSMERS) {
 								res = (mi64_twopmodq_qmmp(findex, k, u64_arr) == 1);
 								if(res != (mi64_twopmodq(p, lenP, k, q, lenQ, q2) == 1) || q2[0] != u64_arr[0]) {
-									sprintf(cbuf, "ERROR: Spot-check k = %llu, Results of mi64_twopmodq_qmmp and mi64_twopmodq differ!\n", k);
+									sprintf(cbuf, "ERROR: Spot-check k = %" PRIu64 ", Results of mi64_twopmodq_qmmp and mi64_twopmodq differ!\n", k);
 									fprintf(fp,"%s", cbuf);
-									ASSERT(HERE, 0, cbuf);
+									ASSERT(0, cbuf);
 								}
 							}
 
@@ -3333,18 +3333,18 @@ candidate factors that survive sieving.	*/
 							mi64_clear(u64_arr, lenQ);	// Use q2 for quotient [i.e. factor-candidate k] and u64_arr for remainder
 							mi64_div(q,two_p,lenQ,lenQ,q2,u64_arr);
 							if(mi64_getlen(q2, lenQ) != 1) {
-								sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %llu, Current q = %s: k must be 64-bit!\n",
+								sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s: k must be 64-bit!\n",
 									(uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]);
 								fprintf(fp,"%s", cbuf);
-								ASSERT(HERE, 0, cbuf);
+								ASSERT(0, cbuf);
 							}
 							if(!mi64_cmp_eq_scalar(u64_arr, 1ull, lenQ))
 							{
-								sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %llu, Current q = %s: q mod (2p) = %s != 1!\n",
+								sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s: q mod (2p) = %s != 1!\n",
 									(uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)],
 									&cbuf2[convert_mi64_base10_char(cbuf2, u64_arr, lenQ, 0)]);
 								fprintf(fp,"%s", cbuf);
-								ASSERT(HERE, 0, cbuf);
+								ASSERT(0, cbuf);
 							}
 
 							/* If q is composite [only check this in debug mode since it costs more than checking
@@ -3354,7 +3354,7 @@ candidate factors that survive sieving.	*/
 							mi64_sub_scalar(q2,1ull,q2,lenQ);	// Re-use q2 to store q-1
 							if(mi64_twopmodq(q2, lenQ, 0, q, lenQ, 0x0) != 1) {
 							#if SPOT_CHECK
-								printf(" INFO: Spot-check q with k = %llu is composite\n",k);
+								printf(" INFO: Spot-check q with k = %" PRIu64 " is composite\n",k);
 							#endif
 								l = 3;
 								for(m = 0; m < nprime; m++) {
@@ -3364,21 +3364,21 @@ candidate factors that survive sieving.	*/
 									if(mi64_is_div_by_scalar32((uint32 *)q, l, lenQ)) {
 									#ifdef MULTITHREAD
 									//	if(tid != 0) break;	// Can make thread-specific by fiddling the rhs of the !=
-										printf("Thread %u, k = %llu: q = ",tid,k);
-										if(lenQ > 1)printf("2^64 * %llu + ",q[1]);
-										printf("%llu has a small divisor: %u\n",q[0], l);
-										ASSERT(HERE, 0, "Abort...");
+										printf("Thread %u, k = %" PRIu64 ": q = ",tid,k);
+										if(lenQ > 1)printf("2^64 * %" PRIu64 " + ",q[1]);
+										printf("%" PRIu64 " has a small divisor: %u\n",q[0], l);
+										ASSERT(0, "Abort...");
 									#else
-										sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %llu, Current q = %s has a small divisor: %u\n",
+										sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s has a small divisor: %u\n",
 											(uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)],l);
 										fprintf(fp,"%s", cbuf);
-										ASSERT(HERE, 0, cbuf);
+										ASSERT(0, cbuf);
 									#endif
 									}
 								}
 							} else {
 							#if SPOT_CHECK
-								printf(" INFO: Spot-check q with k = %llu is base-2 PRP\n",k);
+								printf(" INFO: Spot-check q with k = %" PRIu64 " is base-2 PRP\n",k);
 							#endif
 							}
 							fclose(fp); fp = 0x0;
@@ -3413,35 +3413,35 @@ candidate factors that survive sieving.	*/
 									if(k < 1000) {
 										printf("Do deep sieving for k = %u\n",(uint32)k);
 									/****** Apr 2105: This all needs to be made thread-safe ******/
-									ASSERT(HERE, 0, "This all needs to be made thread-safe!");
+									ASSERT(0, "This all needs to be made thread-safe!");
 										kdeep[*ndeep++] = (uint32)k;
-										ASSERT(HERE, *ndeep < 1024, "Increase allocation of kdeep[] array or use deeper sieving bound to reduce #candidate k's!");
+										ASSERT(*ndeep < 1024, "Increase allocation of kdeep[] array or use deeper sieving bound to reduce #candidate k's!");
 									//	itmp64 = factor_qmmp_sieve64((uint32)findex, k, MAX_SIEVING_PRIME+2, 0x0001000000000000ull);
 									//	if(itmp64) {
-									//		printf("Q( k = %u ) has a small factor: %20llu\n",(uint32)k, itmp64);
+									//		printf("Q( k = %u ) has a small factor: %20" PRIu64 "\n",(uint32)k, itmp64);
 									//	}
 									}
 									res = 0;
 								} else {
-									ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
+									ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
 									q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 									res = (mi64_twopmodq_qmmp(findex, k, u64_arr) == 1);
 								// Uncomment to debug by comparing the results of the slow and fast-MMp-optimized modmul routines
 								/*
 									if(res != (mi64_twopmodq(p, lenP, k, q, lenQ, q2) == 1) || q2[0] != u64_arr[0]) {
-										ASSERT(HERE, 0, "bzzt!");
+										ASSERT(0, "bzzt!");
 									}
 								*/
 								}
 							} else {
-								ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
+								ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!");
 								q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 								res = mi64_twopmodq(p, lenP, k, q, lenQ, u64_arr);
 							}
 
 						  #elif(defined(P4WORD))
 
-							ASSERT(HERE, 0ull == mi64_mul_scalar(two_p,k,(uint64*)&q256,lenQ), "2.k.p overflows!");
+							ASSERT(0ull == mi64_mul_scalar(two_p,k,(uint64*)&q256,lenQ), "2.k.p overflows!");
 							q256.d0 += 1;	// No need to check for carry since 2.k.p even
 							p256.d0 = p[0]; p256.d1 = p[1]; p256.d2 = p[2]; p256.d3 = p[3];
 							t256 = twopmodq256(p256,q256);
@@ -3451,12 +3451,12 @@ candidate factors that survive sieving.	*/
 
 						   #ifdef USE_FLOAT
 
-							ASSERT(HERE, !p[2], "twopmodq200: p[2] nonzero!");
+							ASSERT(!p[2], "twopmodq200: p[2] nonzero!");
 							x256 = twopmodq200_8WORD_qmmp(p,k);	res = (uint64)CMPEQ256(x256, ONE256);
 
 						   #else
 
-							ASSERT(HERE, 0ull == mi64_mul_scalar(two_p,k,(uint64*)&q192,lenQ), "2.k.p overflows!");
+							ASSERT(0ull == mi64_mul_scalar(two_p,k,(uint64*)&q192,lenQ), "2.k.p overflows!");
 							q192.d0 += 1;	// No need to check for carry since 2.k.p even
 							p192.d0 = p[0]; p192.d1 = p[1]; p192.d2 = p[2];
 							t192 = twopmodq192(p192,q192);
@@ -3505,7 +3505,7 @@ candidate factors that survive sieving.	*/
 							  #endif
 								else
 								{
-									ASSERT(HERE, fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!");
+									ASSERT(fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!");
 								  #if USE_128x96 == 1
 									/* Use strictly  96-bit routines: */
 									res = twopmodq96	(p[0],k);
@@ -3514,7 +3514,7 @@ candidate factors that survive sieving.	*/
 									res = twopmodq128_96(p[0],k);
 								  #else
 									/* Use fully 128-bit routines: */
-								//	ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,(uint64*)&q128,lenQ), "2.k.p overflows!");
+								//	ASSERT(0 == mi64_mul_scalar(two_p,k,(uint64*)&q128,lenQ), "2.k.p overflows!");
 									res = twopmodq128x2(p,k);
 								  #endif
 								}
@@ -3531,7 +3531,7 @@ candidate factors that survive sieving.	*/
 								#error	TRYQ = 2 / P3WORD only allowed if USE_FLOAT is defined!
 							#endif	/* #ifdef USE_FMADD */
 
-							ASSERT(HERE, !p[2], "twopmodq200: p[2] nonzero!");
+							ASSERT(!p[2], "twopmodq200: p[2] nonzero!");
 							res  = twopmodq200_8WORD_qmmp_x2_sse2(p,k_to_try[0],k_to_try[1]);
 
 						  #elif(defined(P1WORD))
@@ -3558,7 +3558,7 @@ candidate factors that survive sieving.	*/
 
 						  #ifdef P3WORD
 
-						//	ASSERT(HERE, !p[2], "twopmodq200: p[2] nonzero!");
+						//	ASSERT(!p[2], "twopmodq200: p[2] nonzero!");
 							res = twopmodq192_q4(p,k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]);
 
 						  #elif(defined(P2WORD))
@@ -3609,7 +3609,7 @@ candidate factors that survive sieving.	*/
 									res = twopmodq72_q4(p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]);
 							  #endif
 								else {
-									ASSERT(HERE, fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!");
+									ASSERT(fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!");
 								#if USE_128x96 == 1
 									/* Use strictly  96-bit routines: */
 									res = twopmodq96_q4		(p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3], 0,tid);
@@ -3670,7 +3670,7 @@ candidate factors that survive sieving.	*/
 									res = twopmodq65_q8(p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3],k_to_try[4],k_to_try[5],k_to_try[6],k_to_try[7]);
 							  #endif
 								else {
-									ASSERT(HERE, fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!");
+									ASSERT(fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!");
 								#if USE_128x96 == 1
 									/* Use strictly  96-bit routines: */
 									res = twopmodq96_q8		(p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3],k_to_try[4],k_to_try[5],k_to_try[6],k_to_try[7], 0,tid);
@@ -3742,7 +3742,7 @@ candidate factors that survive sieving.	*/
 									q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 									if(mi64_twopmodq(p, lenP, k_to_try[l], q, lenQ, q2) != 1)
 									{
-										fprintf(stderr, "ERROR: k = %llu, post-check indicates this does not yield a factor.\n", k_to_try[l]);
+										fprintf(stderr, "ERROR: k = %" PRIu64 ", post-check indicates this does not yield a factor.\n", k_to_try[l]);
 									//	printf("Args sent to mi64_twopmodq:\n");
 									//	printf("p = %s\n", &cbuf[convert_mi64_base10_char(cbuf, p, lenP, 0)]);
 									//	printf("q = %s\n", &cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]);
@@ -3759,9 +3759,9 @@ candidate factors that survive sieving.	*/
 										if(mi64_pprimeF(q, 3ull, lenQ)) {
 											factor_k[(*nfactor)++] = k_to_try[l];
 											if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
-												sprintf(cbuf,"\n\tFactor found: q = %s = 2^(%u+2)*%llu. This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],findex,k_to_try[l]/2);
+												sprintf(cbuf,"\n\tFactor found: q = %s = 2^(%u+2)*%" PRIu64 ". This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],findex,k_to_try[l]/2);
 											else
-												sprintf(cbuf,"\n\tFactor found: q = %s = 2*p*k + 1 with k = %llu. This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],k_to_try[l]);
+												sprintf(cbuf,"\n\tFactor found: q = %s = 2*p*k + 1 with k = %" PRIu64 ". This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],k_to_try[l]);
 										#ifdef FAC_DEBUG
 											if(TRYQM1 > 1)
 												printf("factor was number %u of 0-%u in current batch.\n", l, TRYQM1);
@@ -3773,7 +3773,7 @@ candidate factors that survive sieving.	*/
 												printf("\n\tComposite Factor found: q = %s; checking if any previously-found ones divide it...\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)]);
 												for(j = 0; j < *nfactor; j++) {
 													q2[lenP] = mi64_mul_scalar( p, 2*factor_k[j], q2, lenP);
-													ASSERT(HERE, lenP == 1 && q2[lenP] == 0ull, "Unexpected carryout in known-factor computation!");
+													ASSERT(lenP == 1 && q2[lenP] == 0ull, "Unexpected carryout in known-factor computation!");
 													q2[0] += 1;	// q2 = 2.k.p + 1; No need to check for carry since 2.k.p even
 													mi64_clear(u64_arr, lenQ);	// Use u64_arr for quotient; only care if remainder == 0 or not
 													if(mi64_div(q,q2,lenQ,lenQ,u64_arr,0x0)) {
@@ -3786,9 +3786,9 @@ candidate factors that survive sieving.	*/
 														to get k2 from k and k1, use k2 = (k - k1)/f1: */
 														factor_k[*nfactor-1] = (factor_k[*nfactor-1] - factor_k[j])/q2[0];
 														if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
-															sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2^(%u+2)*%llu.\n",findex,factor_k[j]);
+															sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2^(%u+2)*%" PRIu64 ".\n",findex,factor_k[j]);
 														else
-															sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2*p*k + 1 with k = %llu.\n",factor_k[j]);
+															sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2*p*k + 1 with k = %" PRIu64 ".\n",factor_k[j]);
 													}
 													mi64_set_eq(q, u64_arr, lenQ);
 												}
@@ -3801,11 +3801,11 @@ candidate factors that survive sieving.	*/
 									#ifdef FACTOR_STANDALONE
 										fprintf(stderr,"%s", cbuf);
 									#else
-										fp = mlucas_fopen(STATFILE,"a");	ASSERT(HERE, fp != 0x0,"0");
+										fp = mlucas_fopen(STATFILE,"a");	ASSERT(fp != 0x0,"0");
 										fprintf(fp,"%s", cbuf);
 										fclose(fp); fp = 0x0;
 									#endif
-										fp = mlucas_fopen(   OFILE,"a");	ASSERT(HERE, fp != 0x0,"0");
+										fp = mlucas_fopen(   OFILE,"a");	ASSERT(fp != 0x0,"0");
 										fprintf(fp,"%s", cbuf);
 										fclose(fp); fp = 0x0;
 									#ifdef QUIT_WHEN_FACTOR_FOUND
@@ -3840,21 +3840,21 @@ candidate factors that survive sieving.	*/
 					itmp64 = mi64_mul_scalar(two_p,k_to_try[l],q,lenQ);
 					// Should only happen benignly, for q just above a wordcount boundary due to padding at high end of current sieve interval
 				//	if(itmp64)
-				//		fprintf(stderr,"2.k.p overflows for k = %llu, result = %llu*2^64 + %llu\n",k_to_try[l],itmp64,q[0]);
+				//		fprintf(stderr,"2.k.p overflows for k = %" PRIu64 ", result = %" PRIu64 "*2^64 + %" PRIu64 "\n",k_to_try[l],itmp64,q[0]);
 					q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
-				//	if(k_to_try[0] > 16300000 && k_to_try[0] < 16340000)printf("A: Trying k[%u] = %llu, q = %s\n",l,k_to_try[l],&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]);
+				//	if(k_to_try[0] > 16300000 && k_to_try[0] < 16340000)printf("A: Trying k[%u] = %" PRIu64 ", q = %s\n",l,k_to_try[l],&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]);
 				#endif
 
 				#ifdef P4WORD
 
-					ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!");
+					ASSERT(0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!");
 					q[0] += 1;	// No need to check for carry since 2.k.p even
 					t256 = twopmodq256(*(uint256*)p,*(uint256*)q);
 					res = CMPEQ256(t256, ONE256);
 
 				#elif(defined(P3WORD))
 
-					ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!");
+					ASSERT(0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!");
 					q[0] += 1;	// No need to check for carry since 2.k.p even
 					t192 = twopmodq192(*(uint192*)p,*(uint192*)q);
 					res = CMPEQ192(t192, ONE192);
@@ -3887,12 +3887,12 @@ candidate factors that survive sieving.	*/
 						#ifdef FACTOR_STANDALONE
 							fprintf(stderr,"%s", cbuf);
 						#else
-							fp = mlucas_fopen(STATFILE,"a");	ASSERT(HERE, fp != 0x0,"0");
+							fp = mlucas_fopen(STATFILE,"a");	ASSERT(fp != 0x0,"0");
 							fprintf(fp,"%s", cbuf);
 							fclose(fp); fp = 0x0;
 						#endif
 
-							fp = mlucas_fopen(   OFILE,"a");	ASSERT(HERE, fp != 0x0,"0");
+							fp = mlucas_fopen(   OFILE,"a");	ASSERT(fp != 0x0,"0");
 							fprintf(fp,"%s", cbuf);
 							fclose(fp); fp = 0x0;
 
@@ -3920,7 +3920,7 @@ candidate factors that survive sieving.	*/
 		// Every 1024th pass, write the checkpoint file, with format as described previously:
 		if(((sweep + 1) %(1024/lenQ + 1)) == 0 || ((sweep + 1) == interval_hi)) {
 			i = write_savefile(RESTARTFILE, pstring, pass, k, count);	// Only overwrite passnow, know and count fields of savefile
-			ASSERT(HERE,!i,"There were errors writing the savefile ... aborting");
+			ASSERT(!i,"There were errors writing the savefile ... aborting");
 		}	/* Successfully wrote restart file. */
 	#endif /* #if !FAC_DEBUG */
 	}
@@ -3956,12 +3956,12 @@ candidate factors that survive sieving.	*/
 			if(!fp) {
 				fprintf(stderr,"INFO: factoring savefile %s not found - will create.\n",RESTARTFILE);
 			} else {	// If file exists, it should have the proper first 2 lines:
-				itmp = fscanf(fp,"%s\n",cstr); if(itmp <= 0 || !STREQ(cstr,pstring)) ASSERT(HERE,0,"Line 1 entry found in factoring savefile does not match exponent of run.");
-				itmp = fscanf(fp,"%u\n",&i  ); if(itmp <= 0 || i != TF_PASSES      ) ASSERT(HERE,0,"Line 2 entry found in factoring savefile does not match TF_PASSES value of build.");
+				itmp = fscanf(fp,"%s\n",cstr); if(itmp <= 0 || !STREQ(cstr,pstring)) ASSERT(0,"Line 1 entry found in factoring savefile does not match exponent of run.");
+				itmp = fscanf(fp,"%u\n",&i  ); if(itmp <= 0 || i != TF_PASSES      ) ASSERT(0,"Line 2 entry found in factoring savefile does not match TF_PASSES value of build.");
 			}
 			if(!fq) {
 				fprintf(stderr,"INFO: Unable to open factoring savefile %s for reading and/or %s.tmp for writing...quitting.\n",RESTARTFILE,RESTARTFILE);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 
 			curr_line = 0;
@@ -3970,14 +3970,14 @@ candidate factors that survive sieving.	*/
 			itmp = fprintf(fq,"%s\n",pstring);
 			if(itmp <= 0) {
 				fprintf(stderr,"ERROR: unable to write Line %d (current exponent) to %s.\n", curr_line, TMPFILE);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 			/* TF_PASSES: */
 			++curr_line;
 			itmp = fprintf(fq,"%u\n",TF_PASSES);
 			if(itmp <= 0) {
 				fprintf(stderr,"ERROR: unable to write Line %d (TF_PASSES of build) to %s!\n", curr_line, TMPFILE);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 
 			// Now copy any remaining entries in existing file, modifying only the one corr. to the current pass, if it exists:
@@ -3986,14 +3986,14 @@ candidate factors that survive sieving.	*/
 					if((char_addr = strstr(cstr,"Pass ")) != 0) {
 						itmp = sscanf(char_addr,"%u",i);
 						if(itmp <= 0) {
-							fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(HERE, 0,"0");
+							fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(0,"0");
 						}
 						if(i == pass) {	// Is the pass index the one we are updating? If yes, update the k-value
-							ASSERT(HERE, !found_pass, "Multiple current-pass entry found in savefile!");
+							ASSERT(!found_pass, "Multiple current-pass entry found in savefile!");
 							found_pass = TRUE;
 							// Calculate the current k-value
 							k = (uint64)incr + (sweep+1)*(sieve_len<<6);
-							fprintf(fq,"Pass %u: %llu\n",pass,k);
+							fprintf(fq,"Pass %u: %" PRIu64 "\n",pass,k);
 						} else			// Otherwise just copy as-is
 							fputs(cstr,fq);
 					} else {	// Just copy as-is
@@ -4005,7 +4005,7 @@ candidate factors that survive sieving.	*/
 			fclose(fq); fq = 0x0;
 			if(rename(TMPFILE,RESTARTFILE)) {
 				sprintf(cstr,"ERROR: unable to rename %s file ==> %s.\n",TMPFILE,RESTARTFILE);
-				ASSERT(HERE,0,cstr);
+				ASSERT(0,cstr);
 			}
 		}	// Successfully updated restart file.
 	  #endif /* #if !FAC_DEBUG */
@@ -4035,9 +4035,9 @@ candidate factors that survive sieving.	*/
 	  #ifdef MULTITHREAD
 
 		pthread_mutex_lock(&mutex_updatecount);
-	//	printf("Thread %u locked mutex_updatecount ... Updating q-tried count: %llu + %llu = ",tid,*(targ->count),count);
+	//	printf("Thread %u locked mutex_updatecount ... Updating q-tried count: %" PRIu64 " + %" PRIu64 " = ",tid,*(targ->count),count);
 		*(targ->count) += count;
-	//	printf("%llu ... Thread %u done.\n",*(targ->count),tid);
+	//	printf("%" PRIu64 " ... Thread %u done.\n",*(targ->count),tid);
 		pthread_mutex_unlock(&mutex_updatecount);
 		return 0x0;
 	  #else
@@ -4121,7 +4121,7 @@ uint32 CHECK_PKMOD60(uint64*p, uint32 lenP, uint64 k, uint32*incr)
 			return i;
 		}
 	}
-	ASSERT(HERE, i == 16, "Expect precisely 16 valid k (mod 60) classes!");
+	ASSERT(i == 16, "Expect precisely 16 valid k (mod 60) classes!");
 //printf("\n");
 	return i;	// Nonzero return value indicates success
 }
@@ -4158,7 +4158,7 @@ uint32 CHECK_PKMOD4620(uint64*p, uint32 lenP, uint64 k, uint32*incr)
 		} else {
 			// Mersenne: For a valid p-mod, the only possible value of km are those for which k == +-1 (mod 8) [by quadratic residuacity]
 			// and for which GCD(2*km*pm + 1, 2*4620) = 1, i.e. (2*km*pm + 1) is not divisible by 3,5,7 or 11.
-		//	printf("CHECK_PKMOD4620: pm,km = %u,%u: q = %llu [mod 8 = %u]\n",pm,km,q,(uint32)q&7);
+		//	printf("CHECK_PKMOD4620: pm,km = %u,%u: q = %" PRIu64 " [mod 8 = %u]\n",pm,km,q,(uint32)q&7);
 			if(((q&7) == 1) || ((q&7) == 7)) {
 				if((q%3 == 0) || (q%5 == 0) || (q%7 == 0) || (q%11 == 0))
 					return 0;
@@ -4189,7 +4189,7 @@ uint32 CHECK_PKMOD4620(uint64*p, uint32 lenP, uint64 k, uint32*incr)
 			return i;
 		}
 	}
-	ASSERT(HERE, i == 960, "Expect precisely 960 valid k (mod 4620) classes!");
+	ASSERT(i == 960, "Expect precisely 960 valid k (mod 4620) classes!");
 	return i;	// Nonzero return value indicates success
 }
 
@@ -4209,7 +4209,7 @@ uint32 twop_mod_smallp(const int MODULUS_TYPE, const uint64*two_p, const uint32
 		r += (-((int32)r < 0)) & curr_p;
 		r += r;
 		if(r >= curr_p) { r -= curr_p; }
-	//	ASSERT(HERE, r == mi64_div_y32(two_p, curr_p, 0x0, len2P), "Fast 2p (mod q) for MMp fails!");
+	//	ASSERT(r == mi64_div_y32(two_p, curr_p, 0x0, len2P), "Fast 2p (mod q) for MMp fails!");
 	} else {
 		r = mi64_div_y32(two_p, curr_p, 0x0, len2P);
 	}
@@ -4238,7 +4238,7 @@ void	get_startval(
 	curr_p = p_last_small;
 	for(m = nclear; m < nprime; m++)
 	{
-		curr_p += (pdiff[m] << 1);	ASSERT(HERE, pprimeF(curr_p,2), "Alleged curr_p is Composite!");
+		curr_p += (pdiff[m] << 1);	ASSERT(pprimeF(curr_p,2), "Alleged curr_p is Composite!");
 		uint32 twop_mod_currp = twop_mod_smallp(MODULUS_TYPE, two_p, findex, lenQ, curr_p);	// This handles both the 1-word and multiword-exponent cases
 		// Special-handling code for p == curr_p case - this is needed to prevent 0-input  assertion in the modinv computation below.
 		// Dec 2019: Replaced (p == curr_p) with if() clause which also catches curr_p-divides-exponent for composite exponents:
@@ -4323,7 +4323,7 @@ void	get_startval(
 		if(interval_lo != 0) {
 			/* bit_len is a uint32, so use i (also a 32-bit) in place of k (64-bit) here: */
 			i = ceil(1.0*bit_len/curr_p);
-			ASSERT(HERE, i*curr_p - bit_len == curr_p - (bit_len % curr_p), "i*curr_p - bit_len == curr_p - (bit_len % curr_p)");
+			ASSERT(i*curr_p - bit_len == curr_p - (bit_len % curr_p), "i*curr_p - bit_len == curr_p - (bit_len % curr_p)");
 
 			/* Now calculate dstartval for the actual current-pass kmin value,
 			according to the number of times we'd need to run through the sieve
@@ -4337,10 +4337,10 @@ void	get_startval(
 				startval[m] = dstartval;
 
 		#ifdef FAC_DEBUG
-			ASSERT(HERE, startval     [m] < curr_p, "factor.c : startval     [m] < curr_p");
+			ASSERT(startval     [m] < curr_p, "factor.c : startval     [m] < curr_p");
 		  #if DBG_SIEVE
 			startval_incr[m] = i*curr_p - bit_len;
-			ASSERT(HERE, startval_incr[m] < curr_p, "factor.c : startval_incr[m] < curr_p");
+			ASSERT(startval_incr[m] < curr_p, "factor.c : startval_incr[m] < curr_p");
 		  #endif
 		#endif
 		}
@@ -4366,7 +4366,7 @@ uint64 given_b_get_k(double bits, const uint64 two_p[], uint32 len)
 	l = i-64;
 	k = (uint64)(pow(2.0, bits-l)/(double)itmp64);
 //	convert_uint64_base2_char(cbuf, itmp64);
-//	printf("2*p = %16llX has %u bits, lead64 = %s ==> k = %16llu.\n",itmp64,i,cbuf,k);
+//	printf("2*p = %16" PRIX64 " has %u bits, lead64 = %s ==> k = %16" PRIu64 ".\n",itmp64,i,cbuf,k);
 #endif
 	return k;
 }
@@ -4440,7 +4440,7 @@ uint64*kmin, uint64*know, uint64*kmax, uint32*passmin, uint32*passnow, uint32*pa
 			char_addr++;
 			tf_passes = convert_base10_char_uint64(char_addr);
 			if(tf_passes != TF_PASSES) {
-				++nerr; fprintf(stderr,"ERROR: Line %d of factoring restart file %s: TF_PASSES value [%llu] mismatches that of build [%u]!\n",curr_line,fname, tf_passes, (uint32)TF_PASSES);
+				++nerr; fprintf(stderr,"ERROR: Line %d of factoring restart file %s: TF_PASSES value [%" PRIu64 "] mismatches that of build [%u]!\n",curr_line,fname, tf_passes, (uint32)TF_PASSES);
 			}
 		}
 
@@ -4774,7 +4774,7 @@ int write_savefile(const char*fname, const char*pstring, uint32 passnow, uint64
 		} else if(passnow > passnow_file) {
 			/* No-op */
 		} else {
-			++nerr; fprintf(stderr,"ERROR: In factoring restart file %s: compared to previous checkpoint, passnow[%u] should be same as file[%u] and know[%llu] greater than file[%llu], or passnow should be greater!\n",fname,passnow,passnow_file,know,know_file);
+			++nerr; fprintf(stderr,"ERROR: In factoring restart file %s: compared to previous checkpoint, passnow[%u] should be same as file[%u] and know[%" PRIu64 "] greater than file[%" PRIu64 "], or passnow should be greater!\n",fname,passnow,passnow_file,know,know_file);
 		}
 
 		/* Line 10: passmax: */
diff --git a/src/factor_test.h b/src/factor_test.h
index f8d1fb94..9193f13a 100755
--- a/src/factor_test.h
+++ b/src/factor_test.h
@@ -128,7 +128,7 @@ int test_fac()
 	/* TRYQ: */
 	#ifndef TRYQ
 		/* This flag is required: */
-		ASSERT(HERE, 0,"TRYQ not defined!");
+		ASSERT(0,"TRYQ not defined!");
 	#else
 		i = TRYQ;
 		printf("TRYQ = %u\n", i);
@@ -143,7 +143,7 @@ int test_fac()
 		i = THREE_OP128;
 		printf("THREE_OP128 = %u\n", i);
 		/* iF NONZERO, Must = 1 : */
-		ASSERT(HERE, (THREE_OP128 == 1),"THREE_OP128 Must = 0 or 1!");
+		ASSERT((THREE_OP128 == 1),"THREE_OP128 Must = 0 or 1!");
 		/* Only relevant for TRYQ = 4 or 8: */
 		#if(TRYQ != 4 && TRYQ != 8)
 			#error	THREE_OP128 Only relevant for TRYQ = 4 or 8!
@@ -161,7 +161,7 @@ int test_fac()
 	/* NUM_SIEVING_PRIME: */
 	#ifndef NUM_SIEVING_PRIME
 		/* This flag is required: */
-		ASSERT(HERE, 0,"NUM_SIEVING_PRIME not defined!");
+		ASSERT(0,"NUM_SIEVING_PRIME not defined!");
 	#else
 		i = NUM_SIEVING_PRIME;
 		printf("NUM_SIEVING_PRIME = %u\n", i);
@@ -170,7 +170,7 @@ int test_fac()
 	/* TF_CLASSES: */
 	#ifndef TF_CLASSES
 		/* This flag is required: */
-		ASSERT(HERE, 0,"TF_CLASSES not defined!");
+		ASSERT(0,"TF_CLASSES not defined!");
 	#else
 		i = TF_CLASSES;
 		printf("TF_CLASSES = %u\n", i);
@@ -253,7 +253,7 @@ int test_fac()
 	#else
 		i = USE_128x96;
 		printf("USE_128x96 = %u\n", i);
-		ASSERT(HERE,i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n");
+		ASSERT(i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n");
 		/* Only relevant for factoring up to 128 bits: */
 		#if(defined(P3WORD) || defined(P4WORD))
 			#warning USE_128x96 Only relevant for factoring up to 128 bits!
@@ -270,7 +270,7 @@ int test_fac()
 		#else
 			i = USE_128x96;
 			printf("    USE_128x96 = %u\n", i);
-			ASSERT(HERE,i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n");
+			ASSERT(i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n");
 		#endif
 
 	#endif
@@ -310,17 +310,17 @@ int test_fac()
 	q     = (uint64 *)calloc(l, sizeof(uint64));
 	q2    = (uint64 *)calloc(l, sizeof(uint64));
 	mi64_nega(q,q,l);
-	ASSERT(HERE, mi64_iszero(q,l), "mi64 -0 == 0 check fails!");
+	ASSERT(mi64_iszero(q,l), "mi64 -0 == 0 check fails!");
 	q[0] = 1;	mi64_nega(q,q,l);
 	mi64_add_scalar(q,1,q,l);
-	ASSERT(HERE, mi64_iszero(q,l), "mi64 -1 + 1 == 0 check fails!");
+	ASSERT(mi64_iszero(q,l), "mi64 -1 + 1 == 0 check fails!");
 
 	// Sep 2015 Bugfix: Hit case with len = 3 and these addends, which give a ripple carry into the top word:
 	q[0] =  6216518070457578443ull;	q2[0] = 12230226003251973173ull;
 	q[1] = 16881888488052985758ull;	q2[1] =  1564855585656565857ull;
 	q[2] =       65307107850795ull;	q2[2] =           2051081684ull;
 	mi64_add(q,q2,q,3);
-	ASSERT(HERE, q[0] == 0ull && q[1] == 0ull && q[2] == 65309158932480ull, "Sep 2015 mi64_add bugfix test fails!");
+	ASSERT(q[0] == 0ull && q[1] == 0ull && q[2] == 65309158932480ull, "Sep 2015 mi64_add bugfix test fails!");
 
 	/* Init the RNG: */
 	rng_isaac_init(TRUE);
@@ -332,7 +332,7 @@ int test_fac()
 	mi64_nega(q,q,l);
 	mi64_negl(q2,q2,l);
 	mi64_add_scalar(q2,1,q2,l);
-	ASSERT(HERE, mi64_cmp_eq(q,q2,l), "mi64 -q == ~q+1 check fails!");
+	ASSERT(mi64_cmp_eq(q,q2,l), "mi64 -q == ~q+1 check fails!");
 	free((void*)q);	free((void*)q2);
 	q = q2 = 0x0;
 
@@ -350,7 +350,7 @@ int test_fac()
 	k = 7143819210136784550ull;	p64 = 127;
 	p192.d0 = 2294959606785646778ull; p192.d1 = 10167084567166165345ull; p192.d2 = 2688959234133783535ull;
 	mi64_mul_vector_hi_qmmp((uint64*)&p192, p64, k, (uint64*)&q192, 192);
-	ASSERT(HERE, q192.d0 == 141525868296128525ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qmmp test fails!");
+	ASSERT(q192.d0 == 141525868296128525ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qmmp test fails!");
 
 	/* 09/30/2015: Adapt above to test Fermat-factor analog of above, mi64_mul_vector_hi_qferm:
 	Ex.: q = 2.k.2^128 + 1; k = 3571909605068392275, i.e.
@@ -366,7 +366,7 @@ int test_fac()
 	k = 3571909605068392275ull;	p64 = 128;
 	p192.d0 = 2294959606785646778ull; p192.d1 = 10167084567166165345ull; p192.d2 = 2688959234133783535ull;
 	mi64_mul_vector_hi_qferm((uint64*)&p192, p64, k, (uint64*)&q192, 192);
-	ASSERT(HERE, q192.d0 == 2224217378008898426ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qferm test fails!");
+	ASSERT(q192.d0 == 2224217378008898426ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qferm test fails!");
 
 	// Apr 2015: mi64_div bug debug - 0-pad both inputs to yield a length-4 mi64 array:
 	// Use 2^256 as a template for our 0-padding, but use 1 less leading 0 because convert_base10_char_mi64
@@ -374,14 +374,14 @@ int test_fac()
 	//                    2^256 = 115792089237316195423570985008687907853269984665640564039457584007913129639936:
 	// Feb 2020: Chnages to length-setting logic in convert_base10_char_mi64 mean we must init i,j = 0 prior to calling that function:
 	i = 0; p = convert_base10_char_mi64( "00000000000000000000000000000000000000364131549958466711308970009901738230041", &i);
-	ASSERT(HERE, mi64_getlen(p, i) == 3 && i == 4,"Bad p-length(s) in Apr2015 mi64_div test!");
+	ASSERT(mi64_getlen(p, i) == 3 && i == 4,"Bad p-length(s) in Apr2015 mi64_div test!");
 	j = 0; q = convert_base10_char_mi64( "00000000000000000000000000000000000000000000000000000000019437941122649628431", &j);
-	ASSERT(HERE, mi64_getlen(q, j) == 2 && j == 4,"Bad q-length(s) in Apr2015 mi64_div test!");
+	ASSERT(mi64_getlen(q, j) == 2 && j == 4,"Bad q-length(s) in Apr2015 mi64_div test!");
 	q2      = (uint64 *)calloc(4, sizeof(uint64));	// for quotient
 	u64_arr = (uint64 *)calloc(4, sizeof(uint64));	// for remainder
 	mi64_div(p,q,i,i,q2,u64_arr);
-	ASSERT(HERE, mi64_getlen(     q2, i) == 2 && q2[1] == 1 && q2[0] ==   286286737571717471ull, "bad quotient!");
-	ASSERT(HERE, mi64_getlen(u64_arr, i) == 1 &&          u64_arr[0] ==   618006351061617544ull, "bad remainder!");
+	ASSERT(mi64_getlen(     q2, i) == 2 && q2[1] == 1 && q2[0] ==   286286737571717471ull, "bad quotient!");
+	ASSERT(mi64_getlen(u64_arr, i) == 1 &&          u64_arr[0] ==   618006351061617544ull, "bad remainder!");
 	fprintf(stderr,"Apr2015 mi64_div quicktest passes.\n");
 	free((void*)p); free((void*)q); free((void*)q2); free((void*)u64_arr);
 	p = 0x0; q = 0x0; q2 = 0x0; u64_arr = 0x0;
@@ -391,18 +391,18 @@ int test_fac()
 	two_p = (uint64 *)calloc(i, sizeof(uint64));
 	mi64_add(p,p,two_p,i);
 	j = 0; q = convert_base10_char_mi64("4969289881134175801642878989330437804491760137935869781219375395913301677808943323410612629818326630668131744420258226244511522022525093242408710254941677603671849301746980479735516135243111", &j);
-	ASSERT(HERE, i==j,"0");
+	ASSERT(i==j,"0");
 	q2      = (uint64 *)calloc(i, sizeof(uint64));
 	u64_arr = (uint64 *)calloc(i, sizeof(uint64));
 	mi64_div(q,two_p,i,i,q2,u64_arr);
-	ASSERT(HERE, mi64_getlen(q2, i) == 1 , "k must be 64-bit!");
-	ASSERT(HERE, q2[0] == 4677965, "k != expected value of 9355930!");
+	ASSERT(mi64_getlen(q2, i) == 1 , "k must be 64-bit!");
+	ASSERT(q2[0] == 4677965, "k != expected value of 9355930!");
 	if(!mi64_cmp_eq_scalar(u64_arr, 1ull, i)) {		// Remainder = 1
 		fprintf(stderr,"ERROR : (p, q) = ( %s, %s ) : q mod (2p) = %s != 1!\n",
 					&cbuf0[convert_mi64_base10_char(cbuf0, p, i, 0)],
 					&cbuf1[convert_mi64_base10_char(cbuf1, q, i, 0)],
 					&cbuf2[convert_mi64_base10_char(cbuf2, u64_arr, i, 0)]);
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	} else {
 		fprintf(stderr,"mi64_div quicktest passes.\n");
 	}
@@ -423,11 +423,11 @@ int test_fac()
 	p[0] = 1;	mi64_shl(p,p,j,i);	// 2^n
 	mi64_sub_scalar(p,1,p,i);	// p = 2^n - 1;
 	convert_mi64_base10_char(cbuf0, p, i, 0);
-	ASSERT(HERE, STREQ(cbuf0, "170141183460469231731687303715884105727"), "M127 string-conversion test failed!");
+	ASSERT(STREQ(cbuf0, "170141183460469231731687303715884105727"), "M127 string-conversion test failed!");
 	mi64_set_eq    (q, p, i);
 	mi64_sub_scalar(q ,1ull,q ,i);	// q = p-1
 	j = mi64_twopmodq(q, i, 0, p, i, 0x0);
-	ASSERT(HERE, j == 1, "M127 base-2 PRP test failed!");
+	ASSERT(j == 1, "M127 base-2 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-2 PRP test of M127 passed: Time =%s\n",get_time_str(tdiff));
 
@@ -445,13 +445,13 @@ int test_fac()
 	q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 	if(mi64_twopmodq(p, lenP, 56474845800ull, q, lenQ, q2) != 1) {
 		printf("ERROR: res = %s != 1\n", &cbuf[convert_mi64_base10_char(cbuf0, q2, lenQ, 0)]);
-		ASSERT(HERE, 0, "MM31 known-factor (k = 56474845800) test failed!");
+		ASSERT(0, "MM31 known-factor (k = 56474845800) test failed!");
 	}
 	q[1] = mi64_mul_scalar( p, 2*41448832329225ull, q, lenP);
 	q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 	if(mi64_twopmodq(p, lenP, 41448832329225ull, q, lenQ, q2) != 1) {
 		printf("ERROR: res = %s != 1\n", &cbuf[convert_mi64_base10_char(cbuf0, q2, lenQ, 0)]);
-		ASSERT(HERE, 0, "MM31 known-factor (k = 41448832329225) test failed!");
+		ASSERT(0, "MM31 known-factor (k = 41448832329225) test failed!");
 	}
 	free((void*)p);	free((void*)q);	free((void*)q2);	p = q = q2 = 0x0;
 
@@ -463,18 +463,18 @@ int test_fac()
 	p[0] = 1;	mi64_shl(p,p,j,i);	// 2^n
 	mi64_sub_scalar(p,1,p,i);	// p = 2^n - 1;
 	convert_mi64_base10_char(cbuf0, p, i, 0);
-	ASSERT(HERE, STREQ(cbuf0, "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127"), "M607 string-conversion test failed!");
+	ASSERT(STREQ(cbuf0, "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127"), "M607 string-conversion test failed!");
 	mi64_set_eq    (q, p, i);
 	mi64_sub_scalar(q ,1ull,q ,i);	// q = p-1
 	clock1 = clock();
 	j = mi64_twopmodq(q, i, 0, p, i, 0x0);
-	ASSERT(HERE, j == 1, "M607 base-2 PRP test failed!");
+	ASSERT(j == 1, "M607 base-2 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-2 PRP test of M607 passed: Time =%s\n",get_time_str(tdiff));
 	// Try the general-base PRP routine on the same number:
 	clock1 = clock();
 	j = mi64_pprimeF(p, 3, i);
-	ASSERT(HERE, j == 1, "M607 base-3 PRP test failed!");
+	ASSERT(j == 1, "M607 base-3 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-3 PRP test of M607 passed: Time =%s\n",get_time_str(tdiff));
 	free((void*)p);	free((void*)q);	p = q = 0x0;
@@ -490,13 +490,13 @@ int test_fac()
 	mi64_sub_scalar(q ,1ull,q ,i);	// q = p-1
 	clock1 = clock();
 	j = mi64_twopmodq(q, i, 0, p, i, 0x0);
-	ASSERT(HERE, j == 1, "M4423 base-2 PRP test failed!");
+	ASSERT(j == 1, "M4423 base-2 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-2 PRP test of M4423 passed: Time =%s\n",get_time_str(tdiff));
 	// Try the general-base PRP routine on the same number:
 	clock1 = clock();
 	j = mi64_pprimeF(p, 3, i);
-	ASSERT(HERE, j == 1, "M4423 base-3 PRP test failed!");
+	ASSERT(j == 1, "M4423 base-3 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-3 PRP test of M4423 passed: Time =%s\n",get_time_str(tdiff));
 	free((void*)p);	free((void*)q);	p = q = 0x0;
@@ -509,18 +509,18 @@ int test_fac()
 	q     = (uint64 *)calloc(i, sizeof(uint64));
 	p[0] = 1;	mi64_shl(p,p,j,i);	// 2^n
 	mi64_sub_scalar(p,1,p,i);	// p = 2^n - 1; next we p /= 458072843161 :
-ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/458072843161 divisibility test fails!");
+ASSERT(0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/458072843161 divisibility test fails!");
 	mi64_set_eq    (q, p, i);
 	mi64_sub_scalar(q ,1ull,q ,i);	// q = p-1
 	clock1 = clock();
 	j = mi64_twopmodq(q, i, 0, p, i, 0x0);
-	ASSERT(HERE, j == 1, "M7331 cofactor base-2 PRP test failed!");
+	ASSERT(j == 1, "M7331 cofactor base-2 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-2 PRP test of M7331 cofactor passed: Time =%s\n",get_time_str(tdiff));
 	// Try the general-base PRP routine on the same number:
 	clock1 = clock();
 	j = mi64_pprimeF(p, 3, i);
-	ASSERT(HERE, j == 1, "M7331 cofactor base-3 PRP test failed!");
+	ASSERT(j == 1, "M7331 cofactor base-3 PRP test failed!");
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-3 PRP test of M7331 cofactor passed: Time =%s\n",get_time_str(tdiff));
 	free((void*)p);	free((void*)q);	p = q = 0x0;
@@ -538,7 +538,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	mi64_sub_scalar(q ,1ull,q ,i);	// q = p-1
 	clock1 = clock();
 	j = mi64_twopmodq(q, i, 0, p, i, 0x0);
-	ASSERT(HERE, j == 1, "M11213 base-2 PRP test failed!");
+	ASSERT(j == 1, "M11213 base-2 PRP test failed!");
 	free((void*)p);	free((void*)q);	p = q = 0x0;
 	clock2 = clock();	tdiff = (double)(clock2 - clock1);	clock1 = clock2;
 	printf	("Base-2 PRP test of M11213 passed: Time =%s\n",get_time_str(tdiff));
@@ -557,12 +557,12 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	q[0] = 1;	mi64_shl(q,q,j,i);	// 2^607
 	mi64_sub_scalar(q,1,q,i);		// p = 2^607 - 1;
 	// Mul by any scalar < 2^33 should have no carry out of the 10th 64-bit word
-	ASSERT(HERE, 0 == mi64_mul_scalar(q,2*28115877,q,i), "2.k.M607 (k = 28115877) illegal carryout on scalar-mul!");
+	ASSERT(0 == mi64_mul_scalar(q,2*28115877,q,i), "2.k.M607 (k = 28115877) illegal carryout on scalar-mul!");
 	mi64_set_eq    (q2, q, i);		// q2 = q-1
 	mi64_add_scalar(q ,1ull,q ,i);	// q = 2.k.p + 1
 	convert_mi64_base10_char(cbuf0, q, i, 0);
-	ASSERT(HERE, STREQ(cbuf0, "29866820952126214568806646392159603944715357116119498255498035716027095678819717544056871993402815945328710228895559628455719074056369970920495232704087963394016941839123205985860254232344759"), "q = 2.k.M607+1 (k = 28115877) string-conversion test failed!");
-	ASSERT(HERE, mi64_twopmodq(q2, i, 0, q, i, 0x0) == 1, "q = 2.k.M607+1 (k = 28115877) base-2 PRP test failed!");
+	ASSERT(STREQ(cbuf0, "29866820952126214568806646392159603944715357116119498255498035716027095678819717544056871993402815945328710228895559628455719074056369970920495232704087963394016941839123205985860254232344759"), "q = 2.k.M607+1 (k = 28115877) string-conversion test failed!");
+	ASSERT(mi64_twopmodq(q2, i, 0, q, i, 0x0) == 1, "q = 2.k.M607+1 (k = 28115877) base-2 PRP test failed!");
 	free((void*)q);	free((void*)q2);
 	q = q2 = 0x0;
 
@@ -571,11 +571,11 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	*/
 	// 2nd multiplicand is just leading digits of Pi, sans decimal point:
 	j = 0; q2 = convert_base10_char_mi64("3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233786783165271201909145648566923460348610454326648213393607260249141273724587006606315588174881520920962829254091715364367892590360011330530548820466521", &j);
-	ASSERT(HERE, j == 20, "vector lengths should be 20!");
+	ASSERT(j == 20, "vector lengths should be 20!");
 	q     = (uint64 *)calloc(j, sizeof(uint64));	// output array
 	mi64_mul_vector_hi_qmmp(q2, 1231, 60773088284ull, q, (j<<6));	// q = 2.k.M(p) + 1 with k = 60773088284
 	convert_mi64_base10_char(cbuf0, q, j, 0);
-	ASSERT(HERE, STREQ(cbuf0, "678299328487875406787553667584424766193319571425229812042632483796223090743976740829512533956144441574815272835626612961160454952708658437402700559999225654073147100413573556498251710301510338504761109128343850675314104893353603303495634850631971760134667616782442458276408663375682004856646999060481786800862572039635523841600325205075025327991817191734342347965082117753555537"), "mi64_mul_vector_hi_qmmp test failed!");
+	ASSERT(STREQ(cbuf0, "678299328487875406787553667584424766193319571425229812042632483796223090743976740829512533956144441574815272835626612961160454952708658437402700559999225654073147100413573556498251710301510338504761109128343850675314104893353603303495634850631971760134667616782442458276408663375682004856646999060481786800862572039635523841600325205075025327991817191734342347965082117753555537"), "mi64_mul_vector_hi_qmmp test failed!");
 	free((void*)q);	free((void*)q2);
 	q = q2 = 0x0;
 
@@ -583,14 +583,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	q192.d2=506560280167ull; q192.d1=18446744073709551615ull; q192.d0=18446743060588991281ull;
 	p192 = q192; p192.d0 -= 1;
 	x192 = twopmodq192(p192,q192);
-	ASSERT(HERE, CMPEQ192(x192, ONE192),"Bad twopmodq192 output");
+	ASSERT(CMPEQ192(x192, ONE192),"Bad twopmodq192 output");
 
 #if 0
 	/* 12/23/2008: Use this to help debug the mi64 powering routine: */
 	j = mi64_twopmodq(&p192.d0, 3, 0, &q192.d0, 3, 0x0);
 	if(j != 1) {
 		printf("12/23/2008 mi64_twopmodq Test failed!\n");
-	//	ASSERT(HERE, j == 1, "mi64_twopmodq != 1");
+	//	ASSERT(j == 1, "mi64_twopmodq != 1");
 	//	exit(0);
 	}
 #endif
@@ -615,8 +615,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	MULH192_TRUNC(p192,q192,0ull,y192);	// Expected value of 64-bit carry layer at top of low-half product = 0
 	/* Reference value to compare to: */
 	q192.d2=                  11ull; q192.d1=  320947345442520101ull; q192.d0= 2846153632803221902ull;
-	ASSERT(HERE, CMPEQ192(x192, q192),"MULH192       fails!");
-	ASSERT(HERE, CMPEQ192(y192, q192),"MULH192_TRUNC fails!");
+	ASSERT(CMPEQ192(x192, q192),"MULH192       fails!");
+	ASSERT(CMPEQ192(y192, q192),"MULH192_TRUNC fails!");
 
 	/* Count the # of test q's of the various sizes: */
 	for(ntest63    = 0; fac63   [ntest63   ].p          != 0; ++ntest63   ){}
@@ -641,23 +641,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		// In the Fermat cae we let 2^n play the role of the Mersenne exponent p and generalize from there.
 		p64 = 1ull << ffac64[i].p; k = ffac64[i].q << 1;	// Factors of Fn have form q = k.2^(n+2) + 1; n stored in .p, k in .q
 		q64 = 2*k*p64 + 1;	// p64 now stores 2^n
-		ASSERT(HERE, q64%(p64<<2)==1, "test_fac : q64 % 2^(n+2) != 1 !");
+		ASSERT(q64%(p64<<2)==1, "test_fac : q64 % 2^(n+2) != 1 !");
 		pm60 = p64%60;
 		km60 = k  %60;
 		if(!CHECK_PKMOD60(&p64,1, km60, 0x0)) {
-			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60);
+			ASSERT(0,"0");
 		}
 		pm60 = p64%4620;
 		km60 = k  %4620;
 		if(!CHECK_PKMOD4620(&p64,1, km60, 0x0)) {
-			fprintf(stderr,"Illegal (p,k) mod 4620 pair: p,p mod 4620, k,k mod 4620 = %llu %4u %llu %4u\n",p64,pm60,k,km60);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"Illegal (p,k) mod 4620 pair: p,p mod 4620, k,k mod 4620 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq64(p64, q64);
 		if(res64 != q64-1ull) {	// Nov 2021: fiddled twopmodq64() to return true-mod
-			fprintf(stderr,"ERROR: twopmodq64(F%u, k = %llu) returns non-unity result %u\n",(uint32)ffac64[i].p,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq64(F%u, k = %" PRIu64 ") returns non-unity result %u\n",(uint32)ffac64[i].p,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	}
 
@@ -678,7 +678,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 			fprintf(stderr,"ERROR: twopmodq128(F%u, %s ) returns non-unity result %s\n",(uint32)p64,
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, res128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -699,7 +699,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 			fprintf(stderr,"ERROR: twopmodq192(F%u, %s ) returns non-unity result %s\n",(uint32)p64,
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, res192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -720,7 +720,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 			fprintf(stderr,"ERROR: twopmodq256(F%u, %s ) returns non-unity result %s\n",(uint32)p64,
 					&cbuf1[convert_uint256_base10_char(cbuf1, q256)],
 					&cbuf2[convert_uint256_base10_char(cbuf2, res256)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -736,22 +736,22 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		j =         ffacBig[i].p;	// Fermat index n
 		if(j > 1000) break;			// Tune this as desired to skip larger time-consuming cases
 		l = (uint32)ffacBig[i].d1;	// Power of 2 appearing in factor q = k*2^l + 1
-		ASSERT(HERE, l >= (j+2), "Power of 2 appearing in factor of Fn must be >= [n+2]!");
+		ASSERT(l >= (j+2), "Power of 2 appearing in factor of Fn must be >= [n+2]!");
 		k =         ffacBig[i].d0;	// Factor k; must be odd in this schema
-		ASSERT(HERE, 1ull == (k & 1ull), "k must be odd!");
+		ASSERT(1ull == (k & 1ull), "k must be odd!");
 		lenP = (j+63)>>6;	// Assume Fermat index increases as we traverse ffacBig array, thus this overwrites previous
 		p[0] = 1ull;	p[lenP] = mi64_shl(p,p,j,lenP);	lenP += (p[lenP] != 0ull);	// case's p = (1 << j) array elements.
 		lenQ = (l+63)>>6;
 		q[0] = k;		q[lenQ] = mi64_shl(q,q,l,lenQ);	lenQ += (q[lenQ] != 0ull);
 		q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
-	//printf("Testing F%u, q = %llu * 2^%u + 1, lenQ = %u...\n",j,k,l,lenQ);
+	//printf("Testing F%u, q = %" PRIu64 " * 2^%u + 1, lenQ = %u...\n",j,k,l,lenQ);
 		uint32 res1 = mi64_twopmodq(p, lenP, k << (l-j-1), q, lenQ, q2);	// Fiddle k to put q in Mersenne-like form = 2.k'.2^j + 1
 			//	res1 = mi64_twopmodq_qferm(j, k << (l-j), q2);
 		if(res1 != 1) {
-			fprintf(stderr,"ERROR: mi64_twopmodq(F%u, q = %llu * 2^%u + 1 = %s) returns non-unity result %s\n",j,k,l,
+			fprintf(stderr,"ERROR: mi64_twopmodq(F%u, q = %" PRIu64 " * 2^%u + 1 = %s) returns non-unity result %s\n",j,k,l,
 					&cbuf1[convert_mi64_base10_char(cbuf1, q, lenQ, 0)],
 					&cbuf2[convert_mi64_base10_char(cbuf2,q2, lenQ, 0)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -763,8 +763,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	{
 		p64 = fac63[i].p; q64 = fac63[i].q;
 		/* Make sure the MSB = 0: */
-		ASSERT(HERE, ( int64)p64 > 0, "test_fac : ( int64)p64 > 0");
-		ASSERT(HERE, q64%(2*p64) ==1, "test_fac : q64%(2*p64) ==1");
+		ASSERT(( int64)p64 > 0, "test_fac : ( int64)p64 > 0");
+		ASSERT(q64%(2*p64) ==1, "test_fac : q64%(2*p64) ==1");
 		k = (q64-1)/(2*p64);	for(j = 0; j < 64; j++) { karr[j] = k; }
 		pm60 = p64%60;
 		km60 = k  %60;
@@ -776,19 +776,19 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		compositeness test as an exponent filter: */
 		if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0))
 		{
-			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60);
+			ASSERT(0,"0");
 		}
 
 		if((res64 = twopmodq63(p64, q64)) != 1ull)
 		{
-			fprintf(stderr,"ERROR: twopmodq63(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq63(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		if((res64 = twopmodq64(p64, q64)) != 1ull)
 		{
-			fprintf(stderr,"ERROR: twopmodq64(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq64(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 
 	#ifdef USE_FLOAT
@@ -796,16 +796,16 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq78_3WORD_DOUBLE(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	/* this is currently sse2/msvc only :
 		p192.d0 = p64; p192.d1 = p192.d2 = 0;
 		x256 = twopmodq200_8WORD_DOUBLE((uint64*)&p192, k);	res64 = !x256.d3 && (uint64)CMPEQ192(x256, ONE192);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq200_8WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq200_8WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	*/
 	#endif
@@ -813,15 +813,15 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res96 = twopmodq96(p64, k);
 		if(!CMPEQ96(ONE96,res96))
 		{
-			fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k,
+			fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k,
 					&cbuf2[convert_uint96_base10_char(cbuf2, res96)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128_96(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
+			fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
 		}
 
 	#ifdef USE_FMADD
@@ -829,8 +829,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 
@@ -840,8 +840,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq78_3WORD_DOUBLE_q2(p64,k,k, 0,0);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	  #ifdef USE_FMADD
@@ -849,23 +849,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE_q2(p64,k,k);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2( %llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2( %" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	#elif(TRYQ == 4)
 		res64 = twopmodq63_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq63_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq63_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #ifdef USE_FLOAT
 		res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	  #ifdef USE_FMADD
@@ -873,57 +873,57 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 
 		res64 = twopmodq96_q4(p64,k,k,k,k, 0,0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq96_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq96_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq128_96_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#elif(TRYQ == 8)
 		res64 = twopmodq63_q8(p64,k,k,k,k,k,k,k,k);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq63_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq63_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #if defined(USE_FLOAT) && defined(USE_SSE2) && (OS_BITS == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 		res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq96_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq96_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#elif(TRYQ == 16)
 	  #if defined(USE_FLOAT) && defined(USE_AVX)&& defined(COMPILER_TYPE_GCC) && (OS_BITS == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0);
 		if(res64 != 0xffff)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #else
 		#error (TRYQ == 16) only supported for 64-bit/P1WORD/GCC/AVX builds!
@@ -931,14 +931,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	#elif(TRYQ >= 32)
 		res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0);
 		if(res64 != 0xffffffff) {
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 //	#elif(TRYQ == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0);
 		if(res64 != 0xffffffffffffffff) {
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 	}
@@ -951,7 +951,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	{
 		p64 = fac64[i].p; q64 = fac64[i].q;
 
-		ASSERT(HERE, q64%(2*p64)==1, "test_fac : q64%(2*p64)==1");
+		ASSERT(q64%(2*p64)==1, "test_fac : q64%(2*p64)==1");
 
 		k = (q64-1)/(2*p64);	for(j = 0; j < 64; j++) { karr[j] = k; }
 		pm60 = p64%60;
@@ -964,20 +964,20 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		compositeness test as an exponent filter: */
 		if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0))
 		{
-			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60);
+			ASSERT(0,"0");
 		}
 
 		if(q64%(2*p64) != 1)
 		{
-			fprintf(stderr,"ERROR : (p, q) = ( %llu, %llu ) : q mod (2p) = %llu != 1!\n",p64,q64, q64%(2*p64));
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR : (p, q) = ( %" PRIu64 ", %" PRIu64 " ) : q mod (2p) = %" PRIu64 " != 1!\n",p64,q64, q64%(2*p64));
+			ASSERT(0,"0");
 		}
 
 		if((res64 = twopmodq64(p64, q64)) != 1ull)
 		{
-			fprintf(stderr,"ERROR: twopmodq64(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq64(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 
 	#ifdef USE_FLOAT
@@ -985,24 +985,24 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq78_3WORD_DOUBLE(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 
 		res96 = twopmodq96(p64, k);
 		if(!CMPEQ96(ONE96,res96))
 		{
-			fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k,
+			fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k,
 					&cbuf2[convert_uint96_base10_char(cbuf2, res96)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128_96(p64,k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 
 	#ifdef USE_FMADD
@@ -1010,8 +1010,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE(p64,k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 
 	#endif
@@ -1022,8 +1022,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq78_3WORD_DOUBLE_q2(p64, k,k, 0,0);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	  #ifdef USE_FMADD
@@ -1031,23 +1031,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE_q2(p64, k,k);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	#elif(TRYQ == 4)
 		res64 = twopmodq64_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq64_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq64_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #ifdef USE_FLOAT
 		res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	  #ifdef USE_FMADD
@@ -1055,56 +1055,56 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 		res64 = twopmodq96_q4(p64,k,k,k,k, 0,0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq128_96_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#elif(TRYQ == 8)
 		res64 = twopmodq64_q8(p64,k,k,k,k,k,k,k,k);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq64_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq64_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #if defined(USE_FLOAT) && defined(USE_SSE2) && (OS_BITS == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%llu, k = %llu x 4 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 		res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#elif(TRYQ == 16)
 	  #if defined(USE_FLOAT) && defined(USE_AVX)&& defined(COMPILER_TYPE_GCC) && (OS_BITS == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0);
 		if(res64 != 0xffff)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #else
 		#error (TRYQ == 16) only supported for 64-bit/P1WORD/GCC/AVX builds!
@@ -1112,14 +1112,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	#elif(TRYQ >= 32)
 		res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0);
 		if(res64 != 0xffffffff) {
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 //	#elif(TRYQ == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0);
 		if(res64 != 0xffffffffffffffff) {
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 	}
@@ -1134,7 +1134,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		q128.d1 = (uint64)1; q128.d0 = q64;
 
 		/* Modify this so it'll work with 65-bit q's: */
-		ASSERT(HERE, ((q64-1)/2 + 0x8000000000000000ull)%p64==0, "test_fac : ((q64-1)/2 + 0x8000000000000000ull)%p64==0");
+		ASSERT(((q64-1)/2 + 0x8000000000000000ull)%p64==0, "test_fac : ((q64-1)/2 + 0x8000000000000000ull)%p64==0");
 
 		k = ((q64-1)/2 + 0x8000000000000000ull)/p64;	for(j = 0; j < 64; j++) { karr[j] = k; }
 		pm60 = p64%60;
@@ -1144,37 +1144,37 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		compositeness test as an exponent filter: */
 		if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0))
 		{
-			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60);
+			ASSERT(0,"0");
 		}
 		if((res64 = twopmodq65(p64,k)) != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq65(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq65(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 
 	#ifdef USE_FLOAT
 		res64 = twopmodq78_3WORD_DOUBLE(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 
 		res96 = twopmodq96(p64, k);
 		if(!CMPEQ96(ONE96,res96))
 		{
-			fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k,
+			fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k,
 					&cbuf2[convert_uint96_base10_char(cbuf2, res96)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128_96(p64,k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 
 	#ifdef USE_FMADD
@@ -1182,8 +1182,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 
@@ -1193,8 +1193,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq78_3WORD_DOUBLE_q2(p64, k,k, 0,0);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	  #ifdef USE_FMADD
@@ -1202,23 +1202,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE_q2(p64, k,k);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	#elif(TRYQ == 4)
 		res64 = twopmodq65_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq65_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq65_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #ifdef USE_FLOAT
 		res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 	  #ifdef USE_FMADD
@@ -1226,56 +1226,56 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 		res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 		res64 = twopmodq96_q4(p64,k,k,k,k, 0,0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq128_96_q4(p64,k,k,k,k);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#elif(TRYQ == 8)
 		res64 = twopmodq65_q8(p64,k,k,k,k,k,k,k,k);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq65_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq65_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #if defined(USE_FLOAT) && defined(USE_SSE2) && (OS_BITS == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%llu, k = %llu x 4 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 		res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 		res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#elif(TRYQ == 16)
 	  #if defined(USE_FLOAT) && defined(USE_AVX)&& defined(COMPILER_TYPE_GCC) && (OS_BITS == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0);
 		if(res64 != 0xffff)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #else
 		#error (TRYQ == 16) only supported for 64-bit/P1WORD/GCC/AVX builds!
@@ -1283,14 +1283,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 	#elif(TRYQ >= 32)
 		res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0);
 		if(res64 != 0xffffffff) {
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 //	#elif(TRYQ == 64)
 		res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0);
 		if(res64 != 0xffffffffffffffff) {
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#" PRIX64 ".\n",p64,k, res64);
+			ASSERT(0,"0");
 		}
 	#endif
 	}
@@ -1310,7 +1310,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728
 					&cbuf0[convert_uint64_base10_char (cbuf0, p64)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint64_base10_char (cbuf2, res64)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 		q128.d0 += 1ull;
 
@@ -1348,16 +1348,16 @@ if((q128.d1 >> 14) == 0) {
 	dbl = (double)q96.d0 + (double)q96.d1*TWO64FLOAT;
 	rnd = log(dbl)/log(2.0);
 	if(rnd > 77)
-		printf("p = %10llu, p,k (mod 60) = %2u, %2u, lg(q) = %10.5f\n",p64,pm60,km60,rnd);
+		printf("p = %10" PRIu64 ", p,k (mod 60) = %2u, %2u, lg(q) = %10.5f\n",p64,pm60,km60,rnd);
 }
 */
 		/* This property only applies for prime exponents, so use a quick base-2 Fermat
 		compositeness test as an exponent filter: */
 		if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0))
 		{
-			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %s %4u\n",p64,pm60,
+			fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %s %4u\n",p64,pm60,
 					&cbuf1[convert_uint128_base10_char(cbuf1, x128)],km60);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 	/* Here use full 96-bit q in both floating and 96-bit modmul, so compute for both: */
@@ -1369,17 +1369,17 @@ if((q128.d1 >> 14) == 0) {
 	  if((q96.d1 >> 14) == 0)
 	  {
 		/* Integer-truncation-on-store should obviate the need to subtract 1 from q, and (double)q is only accurate to 53 bits to begin with): */
-		ASSERT(HERE, x128.d1 == 0, "High half of exactly-computed k nonzero!");
+		ASSERT(x128.d1 == 0, "High half of exactly-computed k nonzero!");
 		dbl = (double)q96.d0 + (double)q96.d1*TWO64FLOAT;
 		dbl /= (2.0*p64);
 		rnd = DNINT(dbl);
 		k = (uint64)rnd;
-		ASSERT(HERE, x128.d0 == k, "Approx and exactly-computed k differ!");
+		ASSERT(x128.d0 == k, "Approx and exactly-computed k differ!");
 		res64 = twopmodq78_3WORD_DOUBLE(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  }
 	#endif
@@ -1387,16 +1387,16 @@ if((q128.d1 >> 14) == 0) {
 		res96 = twopmodq96(p64, k);
 		if(!CMPEQ96(ONE96,res96))
 		{
-			fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k,
+			fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k,
 					&cbuf2[convert_uint96_base10_char(cbuf2, res96)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128_96(p64, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	}
 		p128.d0 = p64; p128.d1 = 0;
@@ -1407,15 +1407,15 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, p64)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, res128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 	if(x128.d1 == 0) {
 		res64 = twopmodq128x2((uint64 *)&p128, k);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq128x2(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq128x2(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	}
 
@@ -1424,8 +1424,8 @@ if((q128.d1 >> 14) == 0) {
 		res64 = twopmodq100_2WORD_DOUBLE(p64, q128);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	#endif
 
@@ -1437,8 +1437,8 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq78_3WORD_DOUBLE_q2(p64, k,k, 0,0);
 			if(res64 != 3)
 			{
-				fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 		}
 	  #endif
@@ -1447,8 +1447,8 @@ if((q128.d1 >> 14) == 0) {
 		res64 = twopmodq100_2WORD_DOUBLE_q2(p64, k,k);
 		if(res64 != 3)
 		{
-			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+			ASSERT(0,"0");
 		}
 	  #endif
 
@@ -1462,8 +1462,8 @@ if((q128.d1 >> 14) == 0) {
 				res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0);
 				if(res64 != 15)
 				{
-					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-					ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+					ASSERT(0,"0");
 				}
 			}
 		#endif
@@ -1472,21 +1472,21 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 		#endif
 			res64 = twopmodq96_q4(p64,k,k,k,k, 0,0);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 			res64 = twopmodq128_96_q4(p64,k,k,k,k);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq128_96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq128_96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 		}	// k must be 64-bit
 
@@ -1500,30 +1500,30 @@ if((q128.d1 >> 14) == 0) {
 				res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0);
 				if(res64 != 255)
 				{
-					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%llu, k = %llu x 4 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-					ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+					ASSERT(0,"0");
 				}
 			}
 		#endif
 			res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 			res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq128_96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq128_96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 			res64 = twopmodq128_q8((uint64 *)&p128,k,k,k,k,k,k,k,k);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = %#1X.\n",
 					&cbuf0[convert_uint128_base10_char(cbuf0,p128)],
 					&cbuf1[convert_uint128_base10_char(cbuf1,q128)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}	// k must be 64-bit
 
@@ -1536,8 +1536,8 @@ if((q128.d1 >> 14) == 0) {
 				res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0);
 				if(res64 != 0xffff)
 				{
-					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-					ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+					ASSERT(0,"0");
 				}
 			}
 		#else
@@ -1549,8 +1549,8 @@ if((q128.d1 >> 14) == 0) {
 			if((q96.d1 >> 14) == 0) {
 				res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0);
 				if(res64 != 0xffffffff) {
-					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-					ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64);
+					ASSERT(0,"0");
 				}
 			}
 		}	// k must be 52-bit or less
@@ -1559,8 +1559,8 @@ if((q128.d1 >> 14) == 0) {
 			if((q96.d1 >> 14) == 0) {
 				res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0);
 				if(res64 != 0xffffffffffffffff) {
-					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64);
-					ASSERT(HERE, 0,"0");
+					fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#" PRIX64 ".\n",p64,k, res64);
+					ASSERT(0,"0");
 				}
 			}
 		}	// k must be 52-bit or less
@@ -1586,7 +1586,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, p64)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint64_base10_char (cbuf2, res64)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 		q128.d0 += 1ull;
 
@@ -1600,7 +1600,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint128_base10_char(cbuf0, p128)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, x128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		/* To find the quotient k = (q-1)/(2*p), which may be > 64 bits, use mod-inverse with base 2^128 arithmetic.
@@ -1635,7 +1635,7 @@ if((q128.d1 >> 14) == 0) {
 			fprintf(stderr,"ERROR: Illegal (p,k) mod 60 pair: p, p mod 60, q128, k mod 60 = %s %4u %s %4u\n",
 					&cbuf0[convert_uint64_base10_char (cbuf0,  p64)], pm60,
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)], km60);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res128 = twopmodq128(p128, q128);
@@ -1645,7 +1645,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, p64)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, res128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 
 		#if 0
 			/* 10^31 in binary form - need this to reconstruct large factors that were truncated at 30 digits in the PrimeNet report printout: */
@@ -1773,27 +1773,27 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq128x2((uint64 *)&p128, k);
 			if(res64 != 1)
 			{
-				fprintf(stderr,"ERROR: twopmodq128x2(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				fprintf(stderr,"ERROR: twopmodq128x2(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64);
+				ASSERT(0,"0");
 			}
 
 		#if(TRYQ == 4)
 			res64 = twopmodq128_q4((uint64 *)&p128,k,k,k,k);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq128_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq128_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 					&cbuf0[convert_uint128_base10_char(cbuf0,p128)],
 					&cbuf1[convert_uint128_base10_char(cbuf1,q128)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		#elif(TRYQ == 8)
 			res64 = twopmodq128_q8((uint64 *)&p128,k,k,k,k,k,k,k,k);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n",
+				fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n",
 					&cbuf0[convert_uint128_base10_char(cbuf0,p128)],
 					&cbuf1[convert_uint128_base10_char(cbuf1,q128)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		#endif
 		}	// k must be 64-bit
@@ -1840,7 +1840,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, q128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128x2B((uint64*)&p128, q128);
@@ -1850,7 +1850,7 @@ if((q128.d1 >> 14) == 0) {
 					i,i2, fac63[i].p, fac64[i2].p,
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)], (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	  }
 	}
@@ -1893,7 +1893,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac64[i].q)],
 					&cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, q128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128x2B((uint64*)&p128, q128);
@@ -1903,7 +1903,7 @@ if((q128.d1 >> 14) == 0) {
 					i,i2, fac64[i].p, fac64[i2].p,
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac64[i].q)],
 					&cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)], (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	  }
 	}
@@ -1943,7 +1943,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, x128)]);
 			fprintf(stderr," q128.d1 += fac63[i].q overflows!\n");
-			ASSERT(HERE, q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q");	/* Make sure sum didn't overflow */
+			ASSERT(q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q");	/* Make sure sum didn't overflow */
 		}
 
 		/* Skip the q%(2*p) == 1 and (p%60,q%60) checks, as they don't apply
@@ -1959,7 +1959,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, x128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, res128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res64 = twopmodq128x2B((uint64*)&p128, q128);
@@ -1969,7 +1969,7 @@ if((q128.d1 >> 14) == 0) {
 					i,i2, fac63[i].p, fac65[i2].p,
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, x128)], (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	  }
 	}
@@ -1995,7 +1995,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint128_base10_char(cbuf0, p128)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, x128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res128 = twopmodq128(p128, q128);
@@ -2005,7 +2005,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint128_base10_char(cbuf0, p128)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, q128)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, res128)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -2031,18 +2031,18 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, x192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		// Now compute k = (q-1)/2p, while verifying that q%2p = 1:
 		mi64_div((uint64*)&q192, (uint64*)&two_p192, 3,3, (uint64*)&x192, (uint64*)&res192);	// x192 contains k
-		ASSERT(HERE, x192.d2 == 0 && x192.d1 == 0,"k > 2^64!");
+		ASSERT(x192.d2 == 0 && x192.d1 == 0,"k > 2^64!");
 		if(!CMPEQ192(res192, ONE192))
 		{
 			fprintf(stderr,"ERROR: twopmodq192( %s, %s ) returns non-unity result!\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		/* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */
@@ -2051,38 +2051,38 @@ if((q128.d1 >> 14) == 0) {
 		res64 = twopmodq160_q4(p192,q192,q192,q192,q192);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+			fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 */
 		res64 = twopmodq192_q4((uint64*)&p192,x192.d0,x192.d0,x192.d0,x192.d0);
 		if(res64 != 15)
 		{
-			fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+			fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-			ASSERT(HERE, 0,"0");	// *** disable this to allow fast-UMULH192 timing-testing ***
+			ASSERT(0,"0");	// *** disable this to allow fast-UMULH192 timing-testing ***
 		}
 	#elif(TRYQ == 8)
 /*
 		res64 = twopmodq160_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n",
+			fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 */
 		res64 = twopmodq192_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0);
 		if(res64 != 255)
 		{
-			fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n",
+			fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	#endif
 	}
@@ -2106,7 +2106,7 @@ if((q128.d1 >> 14) == 0) {
 			fprintf(stderr,"ERROR: q != 1 modulo p for M( %s ), q = %s \n",
 					&cbuf0[convert_uint64_base10_char (cbuf0, p64)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		// Now compute k = (q-1)/2p, while verifying that q%2p = 1:
@@ -2117,7 +2117,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, res192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 /*
 		res192 = twopmodq160(p192, q192);
@@ -2127,7 +2127,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, res192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 */
 		res192 = twopmodq192(p192, q192);
@@ -2137,7 +2137,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, res192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		/* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */
@@ -2148,19 +2148,19 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq160_q4(p192,x192.d0,x192.d0,x192.d0,x192.d0);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 						&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 						&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		*/
 			res64 = twopmodq192_q4((uint64*)&p192,x192.d0,x192.d0,x192.d0,x192.d0);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 						&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 						&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 	#elif(TRYQ == 8)
@@ -2170,19 +2170,19 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq160_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n",
+				fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n",
 						&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 						&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		*/
 			res64 = twopmodq192_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n",
+				fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n",
 						&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 						&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 	#endif
@@ -2212,7 +2212,7 @@ if((q128.d1 >> 14) == 0) {
 			fprintf(stderr,"ERROR: q != 1 modulo p for M( %s ), q = %s \n",
 					&cbuf0[convert_uint64_base10_char (cbuf0, p64)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		// Now compute k = (q-1)/2p, while verifying that q%2p = 1:
@@ -2223,7 +2223,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, res192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res192 = twopmodq192(p192, q192);
@@ -2233,7 +2233,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)],
 					&cbuf2[convert_uint192_base10_char(cbuf2, res192)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 	/* this is currently sse2/msvc only :
@@ -2242,10 +2242,10 @@ if((q128.d1 >> 14) == 0) {
 			x256 = twopmodq200_8WORD_DOUBLE((uint64*)&p192, x192.d0);	res64 = !x256.d3 && (uint64)CMPEQ192(x256, ONE192);
 			if(res64 != 1)
 			{
-				fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %llu\n",
+				fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %" PRIu64 "\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1, q192)], res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 	*/
@@ -2263,7 +2263,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint256_base10_char(cbuf0, p256)],
 					&cbuf1[convert_uint256_base10_char(cbuf1, q256)],
 					&cbuf2[convert_uint256_base10_char(cbuf2, x256)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		res256 = twopmodq256(p256, q256);
@@ -2273,7 +2273,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint256_base10_char(cbuf0, p256)],
 					&cbuf1[convert_uint256_base10_char(cbuf1, q256)],
 					&cbuf2[convert_uint256_base10_char(cbuf2, res256)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 
 		/* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */
@@ -2283,10 +2283,10 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq192_q4((uint64*)&p192,x192.d0,x192.d0,x192.d0,x192.d0);
 			if(res64 != 15)
 			{
-				fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 						&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 						&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 	#elif(TRYQ == 8)
@@ -2295,10 +2295,10 @@ if((q128.d1 >> 14) == 0) {
 			res64 = twopmodq192_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0);
 			if(res64 != 255)
 			{
-				fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n",
+				fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n",
 						&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 						&cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 	#endif
@@ -2342,7 +2342,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, x128)]);
 			fprintf(stderr," q128.d1 += fac63[i].q overflows!\n");
-			ASSERT(HERE, q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q");	/* Make sure sum didn't overflow */
+			ASSERT(q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q");	/* Make sure sum didn't overflow */
 		}
 
 		/* Now multiply the 128-bit 63x65-bit factor product by each 64-bit test factor in turn. */
@@ -2384,7 +2384,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf2[convert_uint128_base10_char(cbuf2, x128)],
 					&cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)],
 					&cbuf4[convert_uint192_base10_char(cbuf4, q192)]);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 
 			p256.d0 = p192.d0;	q256.d0 = q192.d0;
@@ -2401,7 +2401,7 @@ if((q128.d1 >> 14) == 0) {
 						&cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)],
 						&cbuf4[convert_uint256_base10_char(cbuf4, q256)],
 						&cbuf5[convert_uint256_base10_char(cbuf5, res256)]);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 
 			/* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */
@@ -2413,10 +2413,10 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf1[convert_uint64_base10_char (cbuf1, fac63[i].q)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, x128)],
 					&cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)]);
-				fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1,q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		#elif(TRYQ == 8)
 			res64 = twopmodq192_q8(p192,q192,q192,q192,q192,q192,q192,q192,q192);
@@ -2426,10 +2426,10 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf1[convert_uint64_base10_char (cbuf1, fac63[i].q)],
 					&cbuf2[convert_uint128_base10_char(cbuf2, x128)],
 					&cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)]);
-				fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n",
+				fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#1X.\n",
 					&cbuf0[convert_uint192_base10_char(cbuf0, p192)],
 					&cbuf1[convert_uint192_base10_char(cbuf1,q192)], (uint32)res64);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		#endif
 		}
@@ -2451,7 +2451,7 @@ if((q128.d1 >> 14) == 0) {
 	{
 		p256 = convert_base10_char_uint256(fac256[i].p);	ADD256(p256,p256,two_p256);
 		q256 = convert_base10_char_uint256(fac256[i].q);
-		ASSERT(HERE, CMPEQ256(xmody256(q256, two_p256, &x256), ONE256), "ERROR: q%(2p) != 1");
+		ASSERT(CMPEQ256(xmody256(q256, two_p256, &x256), ONE256), "ERROR: q%(2p) != 1");
 		res256 = twopmodq256(p256, q256);
 		if(!CMPEQ256(res256, ONE256))
 		{
@@ -2459,7 +2459,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint256_base10_char(cbuf0, p256)],
 					&cbuf1[convert_uint256_base10_char(cbuf1, q256)],
 					&cbuf2[convert_uint256_base10_char(cbuf2, res256)]);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	#if 0	/************* need to use k-based for FP200! **********/
 	/* this is currently sse2/msvc only :
@@ -2468,14 +2468,14 @@ if((q128.d1 >> 14) == 0) {
 	  {
 		p128.d0 = p192.d0;
 		p128.d1 = p192.d1;
-	printf("twopmodq200, p = %s, k = %llu\n", fac256->p, x256.d0);
+	printf("twopmodq200, p = %s, k = %" PRIu64 "\n", fac256->p, x256.d0);
 		x256 = twopmodq200_8WORD_DOUBLE(p128, x256.d0);	res64 = !x256.d3 && (uint64)CMPEQ192(x256, ONE192);
 		if(res64 != 1)
 		{
-			fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %llu\n",
+			fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %" PRIu64 "\n",
 					&cbuf0[convert_uint256_base10_char(cbuf0, p256)],
 					&cbuf1[convert_uint256_base10_char(cbuf1, q256)], res64);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	  }
 	*/
@@ -2518,7 +2518,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)],
 					&cbuf1[convert_uint128_base10_char(cbuf1, x128)]);
 			fprintf(stderr," q128.d1 += fac63[i].q overflows!\n");
-			ASSERT(HERE, q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q");	/* Make sure sum didn't overflow */
+			ASSERT(q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q");	/* Make sure sum didn't overflow */
 		}
 
 		/* Now multiply the 128-bit 63x65-bit factor product by the product of each pair of 64-bit test factors in turn. */
@@ -2555,7 +2555,7 @@ if((q128.d1 >> 14) == 0) {
 					&cbuf4[convert_uint64_base10_char (cbuf4, fac64[jj].q)],
 					&cbuf5[convert_uint256_base10_char(cbuf5, q256)],
 					&cbuf6[convert_uint256_base10_char(cbuf6, res256)]);
-					ASSERT(HERE, 0,"0");
+					ASSERT(0,"0");
 				}
 			}
 		}
diff --git a/src/fermat_mod_square.c b/src/fermat_mod_square.c
index fe6d3f32..1cae6c6a 100644
--- a/src/fermat_mod_square.c
+++ b/src/fermat_mod_square.c
@@ -241,7 +241,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 	// v20: got rid of 1st constraint, so we can use a single mode_flag value in p-1 stage 2 for both vecs we want to fwd-FFT-only
 	//      but input in fwd-FFT-pass-1-already-done mode and ones where we do both FFTs, input in said form and left so on return:
 	//	if(fwd_fft == 1ull)
-	//		ASSERT(HERE, mode_flag < 2, "Only low bit of mode_flag field may be used in this case!");
+	//		ASSERT(mode_flag < 2, "Only low bit of mode_flag field may be used in this case!");
 	}
 
 	/* These came about as a result of multithreading, but now are needed whether built unthreaded or multithreaded */
@@ -266,12 +266,12 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 #endif
 
 #ifdef USE_IMCI512	// 1st-gen Xeon Phi - Use modified 8x8 doubles-transpose algo [1a] from util.c:test_simd_transpose_8x8()
-	ASSERT(HERE,0,"Fermat-mod unsupported in k1om / IMCI-512 build mode!");
+	ASSERT(0,"Fermat-mod unsupported in k1om / IMCI-512 build mode!");
 	exit(1);
 #endif
 	radix0 = RADIX_VEC[0];
 	nchunks = radix0;
-	ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE, "fermat_mod_square: Incorrect TRANSFORM_TYPE!");
+	ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE, "fermat_mod_square: Incorrect TRANSFORM_TYPE!");
 
 /*...initialize things upon first entry */
 
@@ -296,26 +296,26 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		N2 =n/2;		/* Complex vector length.	*/
 		if(!arr_scratch) {
 			sprintf(cbuf, "Init portion of %s requires non-null scratch array!",func);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		}
 		for(i = 0; i < NRADICES; i++) {
 			if(RADIX_VEC[i] == 0) {
 				sprintf(cbuf, "%s: RADIX_VEC[i = %d] zero, for i < [NRADICES = %d]!",func,i,NRADICES);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			radix_set_save[i] = RADIX_VEC[i];
 		}
 		for(i = NRADICES; i < 10; i++) {
 			if(RADIX_VEC[i] != 0) {
 				sprintf(cbuf, "%s: RADIX_VEC[i = %d] nonzero, for i >= [NRADICES = %d]!",func,i,NRADICES);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			radix_set_save[i] = 0;
 		}
 
 		/*...Check that the binary exponent corresponds to a proper Fermat index: */
 		findex = trailz64(p);
-		ASSERT(HERE, p >> findex == 1,"fermat_mod_square.c: p >> findex == 1");
+		ASSERT(p >> findex == 1,"fermat_mod_square.c: p >> findex == 1");
 
 		// Set function pointers for DIF|DIT pass1:
 		dif1_dit1_func_name( radix0, &func_dif1, &func_dit1 );
@@ -323,18 +323,18 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		/* My array padding scheme requires N/radix0 to be a power of 2, and to be >= 2^DAT_BITS, where the latter
 		parameter is set in the Mdata.h file: */
 		if(n%radix0 != 0) {
-			sprintf(cbuf  ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf  ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf);
 		}
 		/* Make sure n/radix0 is a power of 2: */
 		i = n/radix0;
 		if((i >> trailz32(i)) != 1) {
-			sprintf(cbuf  ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf  ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf);
 		}
 
 		if(DAT_BITS < 31) {
 			/* Now make sure n/radix0 is sufficiently large (unless n < 2^DAT_BITS, in which case it doesn't matter): */
 			if(i < (1 << DAT_BITS)) {
-			//	sprintf(cbuf  ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS));	fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			//	sprintf(cbuf  ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS));	fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				// Mar 2018: Switch to 'soft' assertion error here, e.g. for timing tests at small FFT lengths:
 				sprintf(cbuf  ,"n/radix0 must be >= %u! Skipping this radix combo.\n", (1 << DAT_BITS));	WARN(HERE, cbuf, "", 1); return(ERR_ASSERT);
 			}
@@ -342,7 +342,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if((1 << DAT_BITS) < 2*RADIX_VEC[NRADICES-1]) {
 				sprintf(cbuf  ,"ERROR: Value of DAT_BITS means final FFT radix may not exceed = %u!\n", (1 << (DAT_BITS-1)));
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 		}
 
@@ -374,7 +374,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		if(mm*RADIX_VEC[NRADICES-1] != N2) {
 			sprintf(cbuf  ,"ERROR: product of radices not equal to complex vector length\n");
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 	/*	index_ptmp = ALLOC_INT(index_ptmp, k);	<*** Jan 2020: Started getting this error here, NFC as to why:
 			malloc: *** error for object 0x100802608: incorrect checksum for freed object - object was probably modified after being freed.
@@ -383,7 +383,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		{
 			sprintf(cbuf  ,"ERROR: unable to allocate array INDEX in %s.\n",func);
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 		index = ALIGN_INT(index_ptmp);
 	*/
@@ -536,7 +536,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		default:
 			sprintf(cbuf  ,"ERROR: radix %d not available for Fermat-mod transform. Halting...\n",RADIX_VEC[i]);
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		for(i = 1; i < NRADICES; i++)
@@ -576,7 +576,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			default:
 				sprintf(cbuf  ,"ERROR: intermediate radix %d not available. Halting...\n",RADIX_VEC[i]);
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 
 			/* Final radix must be 16 or 32: */
@@ -584,7 +584,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			{
 				sprintf(cbuf  ,"ERROR: final radix %d not available. Halting...\n",RADIX_VEC[i]);
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 		}
 		nradices_prim = l;	for( ; l < 30; l++) { radix_prim[l] = 0; }	// Zero any higher elements which may have been previously set due
@@ -620,8 +620,8 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			baseinv[0] = (double)(1.0/base[0]    );	baseinv[1] = (double)(1.0/base[1]);	/* don't need extended precision for this since both bases are powers of 2.	*/
 
 			/*...stuff for the reduced-length DWT weights arrays is here:	*/
-			wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt);	if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp);
-			wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, nwt);	if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp);
+			wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt);	if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp);
+			wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, nwt);	if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp);
 
 		/******************************************************************/
 		/* Crandall/Fagin weighting factors and number of bits per digit. */
@@ -629,7 +629,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 
 			/* Double-check that sw*nwt (where nwt is the odd factor of N) is divisible by N: */
 		//	printf("sw,nwt,n = %u,%u,%u; sw*nwt mod n = %u\n",sw,nwt,n, (uint64)sw*nwt % n);
-			ASSERT(HERE, (uint64)sw*nwt % n == 0,"fermat_mod_square.c: sw*nwt % n == 0");
+			ASSERT((uint64)sw*nwt % n == 0,"fermat_mod_square.c: sw*nwt % n == 0");
 			SW_DIV_N = sw*nwt/n;
 
 			qn   = i64_to_q((int64) nwt);
@@ -652,7 +652,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: QWT = %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 
 			for(i = 0; i < nwt; i++)
@@ -672,7 +672,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 					if(idiff > max_idiff)
 						max_idiff = idiff;
 					sprintf(cbuf,"INFO: I = %8d: QWT0 = %20.15f, DWT0 = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-					fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 
 				/* Inverse DWT weight factor:	*/
@@ -692,7 +692,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 					if(idiff > max_idiff)
 						max_idiff = idiff;
 					sprintf(cbuf,"INFO: I = %8d: QWT1 = %20.15f, DWT1 = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-					fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 				qwt= qfmul(qwt, qmul);
 			}
@@ -705,14 +705,14 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		/* No need for a fancy NINT here: */
 		NRT_BITS = (uint32)(log(sqrt(1.0*n))/log(2.0) + 0.5);
 		NRT = 1 << NRT_BITS;
-		if(n%NRT){ sprintf(cbuf,"ERROR: NRT does not divide N!\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(n%NRT){ sprintf(cbuf,"ERROR: NRT does not divide N!\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		NRTM1 = NRT - 1;
 
 		/*...The rt0 array stores the (0:NRT-1)th powers of the [N2]th root of unity
 		(i.e. will be accessed using the lower (NRT) bits of the integer sincos index):
 		*/
 		rt0_ptmp = ALLOC_COMPLEX(rt0_ptmp, NRT);
-		if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		rt0 = ALIGN_COMPLEX(rt0_ptmp);
 
 		qt     = i64_to_q((int64)N2);
@@ -736,7 +736,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QCOS1= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		t1 = qfdbl(qi);
@@ -753,7 +753,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QSIN1= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		qt = QZRO;
@@ -775,7 +775,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt0[i].re = t1;
 
@@ -794,7 +794,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt0[i].im = t1;
 
@@ -811,7 +811,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		(and will be accessed using the upper bits, <NRT:31>, of the integer sincos index):
 		*/
 		rt1_ptmp = ALLOC_COMPLEX(rt1_ptmp, n/(2*NRT));
-		if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		rt1 = ALIGN_COMPLEX(rt1_ptmp);
 
 		qn     = i64_to_q((int64)NRT);
@@ -837,7 +837,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QCOS2= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		t1 = qfdbl(qi);
@@ -854,7 +854,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QSIN2= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		qt = QZRO;
@@ -876,7 +876,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt1[i].re = t1;
 
@@ -895,7 +895,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt1[i].im = t1;
 
@@ -916,7 +916,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		/*...The rn0 array stores the (0:NRT-1)th powers of the [2*n]th root of unity
 		(i.e. will be accessed using the lower (NRT) bits of the integer sincos index):
 		*/
-		rn0_ptmp = ALLOC_COMPLEX(rn0_ptmp, NRT);	if(!rn0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); } rn0 = ALIGN_COMPLEX(rn0_ptmp);
+		rn0_ptmp = ALLOC_COMPLEX(rn0_ptmp, NRT);	if(!rn0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); } rn0 = ALIGN_COMPLEX(rn0_ptmp);
 
 		qt     = i64_to_q((int64)N2);
 		qtheta = qfdiv(QPIHALF, qt);	/* (2*pi)/(2*N) = (pi/2)/(N/2) */
@@ -938,7 +938,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QCOS1= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		t1 = qfdbl(qi);
@@ -954,7 +954,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QSIN1= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		qt = QZRO;
@@ -976,7 +976,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rn0[i].re = t1;
 
@@ -994,7 +994,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rn0[i].im = t1;
 
@@ -1010,7 +1010,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		/*...The rn1 array stores the (0:(n/2)/NRT-1)th powers of the [(n/2)/NRT]th root of unity
 		(and will be accessed using the upper bits, <NRT:31>, of the integer sincos index):
 		*/
-		rn1_ptmp = ALLOC_COMPLEX(rn1_ptmp, N2/NRT);	if(!rn1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); } rn1 = ALIGN_COMPLEX(rn1_ptmp);
+		rn1_ptmp = ALLOC_COMPLEX(rn1_ptmp, N2/NRT);	if(!rn1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); } rn1 = ALIGN_COMPLEX(rn1_ptmp);
 
 		qn     = i64_to_q((int64)NRT);
 		qt     = i64_to_q((int64)N2);
@@ -1034,7 +1034,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QCOS2= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		t1 = qfdbl(qi);
@@ -1050,7 +1050,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QSIN2= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		qt = QZRO;
@@ -1072,7 +1072,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rn1[i].re = t1;
 
@@ -1090,7 +1090,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rn1[i].im = t1;
 
@@ -1108,7 +1108,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 			fprintf(stderr, "%s:\n",func);
 			fprintf(stderr, " Max abs error between real*8 and real*16 computed values = %20.15f\n",         max_adiff);
 			fprintf(stderr, " Max bit error between real*8 and real*16 computed values = %20.0f \n", (double)max_idiff);
-			ASSERT(HERE, (max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting.");
+			ASSERT((max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting.");
 		}
 
 	#ifdef MULTITHREAD
@@ -1141,13 +1141,13 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		}
 
 		// Threadpool-based dispatch:
-		ASSERT(HERE, MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!");
+		ASSERT(MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!");
 
 		if(radix0 % NTHREADS != 0) fprintf(stderr,"%s: radix0 not exactly divisible by NTHREADS - This will hurt performance.\n",func);
 
 		main_work_units = 0;
 		pool_work_units = radix0;
-		ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+		ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 		printf("%s: Init threadpool of %d threads\n",func,NTHREADS);
 
 	#endif	// MULTITHREAD?
@@ -1192,11 +1192,11 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 	// v20: Add support for mod_mul with one input being in precomputed fwd-FFTed form:
 #ifdef MULTITHREAD
 	for(i = 0; i < nchunks; ++i) { tdat[i].arrdat = a; tdat[i].fwd_fft = fwd_fft; tdat[i].c = c; }
-//	printf("Thread 0: arrdat = 0x%llX, fwd_fft = 0x%llX\n",tdat[0].arrdat,tdat[0].fwd_fft);
+//	printf("Thread 0: arrdat = %#" PRIX64 ", fwd_fft = %#" PRIX64 "\n",(uint64)tdat[0].arrdat,tdat[0].fwd_fft);
 #endif
 
 	/*...Init clock counter:	*/
-	ASSERT(HERE, tdiff != 0,"fermat_mod_square.c: tdiff != 0");
+	ASSERT(tdiff != 0,"fermat_mod_square.c: tdiff != 0");
 
 #ifdef CTIME
 	clock1 = clock();
@@ -1216,7 +1216,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 		goto undo_initial_ffft_pass;
 	if((mode_flag & 1) == 0)
 	{
-	//	fprintf(stderr,"Array = 0x%llX, Iter = %u, Fwd-WT: mode_flag = 0x%X, ilo = %u, a[1] = %18.10f\n",(uint64)a,ilo+1,mode_flag,ilo,a[1]);
+	//	fprintf(stderr,"Array = %#" PRIX64 ", Iter = %u, Fwd-WT: mode_flag = %#X, ilo = %u, a[1] = %18.10f\n",(uint64)a,ilo+1,mode_flag,ilo,a[1]);
 		// Mar 2017: Can skip this step if it's the start of a production test (note that any initial-residue shift
 		// in such cases is handled via single-array-word forward-DWT-weighting in the Mlucas.c shift_word() function),
 		// but need it if add RNG-input-setting above for debug, hence also check a[1] for nonzero:
@@ -1236,14 +1236,14 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 					j1 = j;
 				#endif
 					j1 = j1 + ( (j1>> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
-					ASSERT(HERE, DNINT(a[j1]) == a[j1],"fermat_mod_square.c: Input a[j] noninteger!");
+					ASSERT(DNINT(a[j1]) == a[j1],"fermat_mod_square.c: Input a[j] noninteger!");
 					wt = wt0[ii];
 					a[j1] *= wt;
 					ii += SW_DIV_N - nwt;
 					ii += ( (-(int)((uint32)ii >> 31)) & nwt);
 				}
 				/* Odds: */
-				ASSERT(HERE, ii == 0,"fermat_mod_square.c: ii == 0");
+				ASSERT(ii == 0,"fermat_mod_square.c: ii == 0");
 				for(j = 0; j < n; j += 2)
 				{
 				#ifdef USE_AVX512
@@ -1257,7 +1257,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 				#endif
 					j1 = j1 + ( (j1>> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
 					j2 = j1 + RE_IM_STRIDE;
-					ASSERT(HERE, DNINT(a[j2]) == a[j2],"fermat_mod_square.c: Input a[j] noninteger!");
+					ASSERT(DNINT(a[j2]) == a[j2],"fermat_mod_square.c: Input a[j] noninteger!");
 					wt = wt0[ii];
 					a[j2] *= wt;
 					ii += SW_DIV_N - nwt;
@@ -1310,7 +1310,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui
 	*/
 	ierr = 0;	/* Any return-value error code (whether fatal or not) stored here */
 
-	ASSERT(HERE, ihi > ilo,"ferm_mod_square.c: ihi <= ilo!");
+	ASSERT(ihi > ilo,"ferm_mod_square.c: ihi <= ilo!");
 
   #if DBG_THREADS
 	fprintf(stderr,"%s: NTHREADS = %3d\n",func,NTHREADS);
@@ -1354,7 +1354,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 	while(tpool->free_tasks_queue.num_tasks != pool_work_units) {
 	//		sleep(1);	//*** too granular ***
 		// Finer-resolution, declared in <time.h>; cf. http://linux.die.net/man/2/nanosleep
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!");
 	//	printf("sleep; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 	}
 //	printf("end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
@@ -1389,7 +1389,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 		MOD_ADD64(RES_SHIFT,RES_SHIFT,p,RES_SHIFT);
 		RES_SHIFT += ((BASE_MULTIPLIER_BITS[i>>6] >> (i&63)) & 1);	// No mod needed on this add, since result of pvs line even and < p, which is itself even in the Fermat-mod case (p = 2^m)
 		const char flip[2] = {' ','*'};
-//	printf("Iter %d: shift = [%c]%llu\n",iter,flip[RES_SIGN],RES_SHIFT);
+//	printf("Iter %d: shift = [%c]%" PRIu64 "\n",iter,flip[RES_SIGN],RES_SHIFT);
 	#endif
 	}
 /*...Do the final inverse FFT pass, carry propagation and initial forward FFT pass in one fell swoop, er, swell loop...	*/
@@ -1473,7 +1473,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 			ierr = radix4096_ditN_cy_dif1    (a,n,nwt,nwt_bits,wt0,wt1,0x0,rn0,rn1,base,baseinv,iter,&fracmax,p); break;
 	*/
 		default:
-			sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 	}
 
 	// v19: Nonzero exit carries used to be fatal, added retry-from-last-savefile handling for these
@@ -1566,12 +1566,12 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 // On early-exit-due-to-interrupt, decrement iter since we didn't actually do the (iter)th iteration
 if(!MLUCAS_KEEP_RUNNING) iter--;
 if(iter < ihi) {
-	ASSERT(HERE, !MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!");
+	ASSERT(!MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!");
 	ierr = ERR_INTERRUPT;
 	ROE_ITER = iter;	// Function return value used for error code, so save number of last-iteration-completed-before-interrupt here
-//	fprintf(stderr,"Caught signal at iter = %u; mode_flag = 0x%X\n",iter,mode_flag);
+//	fprintf(stderr,"Caught signal at iter = %u; mode_flag = %#X\n",iter,mode_flag);
 	mode_flag &= 0xfffffffd;	// v20: In case of interrupt-exit override any mode_flag "skip undo of initial DIF pass" setting
-//	fprintf(stderr,"After ^2-toggle, mode_flag = 0x%X, (mode_flag >> 1) = 0x%X\n",mode_flag,mode_flag>>1);
+//	fprintf(stderr,"After ^2-toggle, mode_flag = %#X, (mode_flag >> 1) = %#X\n",mode_flag,mode_flag>>1);
 }
 
 #ifdef RTIME
@@ -1600,10 +1600,10 @@ if(iter < ihi) {
 
 	// v20: Add support for fwd_fft_only|mode_flag as described in top-of-function comments
 undo_initial_ffft_pass:
-//	printf("Iter %u: ierr = %u, fwd_fft = %llu, mode_flag = %u\n",iter,ierr,fwd_fft,mode_flag);
+//	printf("Iter %u: ierr = %u, fwd_fft = %" PRIu64 ", mode_flag = %u\n",iter,ierr,fwd_fft,mode_flag);
 	if((mode_flag >> 1) == 0)
 	{
-	//	fprintf(stderr,"Array = 0x%llX, Iter = %u, Inv-WT: mode_flag = 0x%X\n",(uint64)a,iter,mode_flag);
+	//	fprintf(stderr,"Array = %#" PRIX64 ", Iter = %u, Inv-WT: mode_flag = %#X\n",(uint64)a,iter,mode_flag);
 		func_dit1(a,n);
 
 	/*...and unweight the data array.	*/
@@ -1636,7 +1636,7 @@ if(iter < ihi) {
 				ii += ( (-(int)((uint32)ii >> 31)) & nwt);\
 			}
 			/* Odds: */
-			ASSERT(HERE, ii == 0,"fermat_mod_square.c: ii == 0");
+			ASSERT(ii == 0,"fermat_mod_square.c: ii == 0");
 			for(j = 0; j < n; j += 2)
 			{
 			#ifdef USE_AVX512
@@ -1723,7 +1723,7 @@ if(iter < ihi) {
 	// [action] Prior to returning, print a "retry successful" informational and rezero ROE_ITER and ROE_VAL.
 	// *** v20: For PRP-test Must make sure we are at end of checkpoint-file iteration interval, not one of the Gerbicz-update subintervals ***
 	if(!INTERACT && ROE_ITER > 0 && ihi%ITERS_BETWEEN_CHECKPOINTS == 0) {	// In interactive (timing-test) mode, use ROE_ITER to accumulate #iters-with-dangerous-ROEs
-		ASSERT(HERE, (ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!");
+		ASSERT((ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!");
 		ROE_ITER = 0;
 		ROE_VAL = 0.0;
 		sprintf(cbuf,"Retry of iteration interval with fatal roundoff error was successful.\n");
@@ -1789,7 +1789,7 @@ void fermat_process_chunk(
 	dyadic-multiply FFT(a) * FFT(b) and iFFT the product, storing the result in a[].
 	*/
   if((fwd_fft & 0xC) != 0) {
-	ASSERT(HERE, ((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *");
+	ASSERT(((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *");
 	incr = RADIX_VEC[NRADICES-1]<<1;
   }	else {
 	for(i=1; i <= NRADICES-2; i++)
@@ -1805,7 +1805,7 @@ void fermat_process_chunk(
 		case 32:
 			radix32_dif_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break;
 		default:
-			sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 		k    += mm*radix0;
 		mm   *= RADIX_VEC[i];
@@ -1827,7 +1827,7 @@ void fermat_process_chunk(
 		case 32:
 			radix32_dyadic_square(&a[jstart],arr_scratch,n,radix0,rt0,rt1,ii,nradices_prim,radix_prim,incr,init_sse2,thr_id, bptr, cptr); break;
 		default:
-			sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 	}
 #ifdef DBG_TIME
 	clock2 = clock();
@@ -1878,7 +1878,7 @@ void fermat_process_chunk(
 		case 32:
 			radix32_dit_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break;
 		default:
-			sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 	}	/* end i-loop */
 
diff --git a/src/fgt_m61.c b/src/fgt_m61.c
index 224642e5..36c28181 100755
--- a/src/fgt_m61.c
+++ b/src/fgt_m61.c
@@ -104,9 +104,9 @@ uint64 prodq8(const uint64 x, const uint64 y)
 #else
 	MUL_LOHI64(x,y,  lo, hi);
 #endif
-	ASSERT(HERE, (lo & 7) == 0, "ERROR: product not divisible by 8 in PRODQ8!");
-//if(hi + (lo >> 3) > q2) fprintf(stderr, "PRODQ8 inputs: %llu,%llu, outputs: %llu,%llu, result = %llu\n",x,y,lo,hi,hi + (lo >> 3));
-//	ASSERT(HERE, hi + (lo >> 3) <= q2, "ERROR: result out of range in PRODQ8!");
+	ASSERT((lo & 7) == 0, "ERROR: product not divisible by 8 in PRODQ8!");
+//if(hi + (lo >> 3) > q2) fprintf(stderr, "PRODQ8 inputs: %" PRIu64 ",%" PRIu64 ", outputs: %" PRIu64 ",%" PRIu64 ", result = %" PRIu64 "\n",x,y,lo,hi,hi + (lo >> 3));
+//	ASSERT(hi + (lo >> 3) <= q2, "ERROR: result out of range in PRODQ8!");
 	return hi + (lo >> 3);		// hi + (lo/8)
 }
 
@@ -116,7 +116,7 @@ uint64 prodq8(const uint64 x, const uint64 y)
 // [2015: Replace elaborate case-based impl of original with simple MULQ, which is fast on more or less all 64-bit arches.]
 uint64 mul_by_3bit(const uint64 a, const uint64 x)
 {
-	ASSERT(HERE, (x >> 61) == 0, "ERROR: x out of range in MUL_BY_3BIT!");
+	ASSERT((x >> 61) == 0, "ERROR: x out of range in MUL_BY_3BIT!");
 	return a * x;
 }
 
@@ -130,7 +130,7 @@ uint64 mul_by_3bit(const uint64 a, const uint64 x)
 // Output bounds: *********************** To-Do! *********************
 uint64 rmul_modq(const uint64 a, const uint64 b)
 {
-	ASSERT(HERE, a < 0x8000000000000000ull && b < 0x4000000000000000ull, "Input(s) out of range!");
+	ASSERT(a < 0x8000000000000000ull && b < 0x4000000000000000ull, "Input(s) out of range!");
 	return prodq8(a<<1, b<<2);
 }
 
@@ -197,12 +197,12 @@ uint64 rmul_modq(const uint64 x, const uint64 y)
 	hi4est = (uint64)dhi - 1;
 	error_mod4 = ( (bd_lo >> 62) - hi4est ) & 3ull; // Error mod 4
 	bd_hi = (hi4est + error_mod4) >> 2;
-	ASSERT(HERE, bd_lo <= (bd_hi << 3) + bd_lo, "ERROR: overflow of b*d(lo + hi>>3) summand!");
+	ASSERT(bd_lo <= (bd_hi << 3) + bd_lo, "ERROR: overflow of b*d(lo + hi>>3) summand!");
 	bd_modq = qreduce((bd_hi << 3) + bd_lo);
 
 	ay = mul_by_3bit((x >> 58),y);
 	cb = mul_by_3bit((y >> 58), (x & two58m1));
-	ASSERT(HERE, cb <= ay+cb, "ERROR: overflow of ay+cb summand!");
+	ASSERT(cb <= ay+cb, "ERROR: overflow of ay+cb summand!");
 	bd_modq = qreduce((bd_hi << 3) + bd_lo);
 
 	// Now form [(a*y + c*b)*2^58 + b*d] mod q.
@@ -465,24 +465,24 @@ The CMUL_MODQ8 variant assumes the inputs are premultiplied by 8 and thus cuts I
 void cmul_modq(const uint64 a0, const uint64 a1, const uint64 b0, const uint64 b1, uint64*xout, uint64*yout)
 {
 	uint64 t00,t01,t11;
-	ASSERT(HERE, (a0 <= bb && a1 <= bb && b0 <= q && b1 <= q), "ERROR: CMUL_MODQ input out of range!");
+	ASSERT((a0 <= bb && a1 <= bb && b0 <= q && b1 <= q), "ERROR: CMUL_MODQ input out of range!");
 
 	// Bounds: b0,b1 in [0,q], so 4(b0+b1) in [0,8q]; prodq8lo/8 always in [0, q].
 	t00 = prodq8(a0, b0<<3);		// a0    in [0, B]: prodq8hi(a0,8b0) in [0, q], t00 in [0, 2q]
 	t11 = prodq8(a1, b1<<3);		// a1    in [0, B]: prodq8hi(a1,8b1) in [0, q], t11 in [0, 2q]
 	*xout = t00 - t11 + q2;		// xout in [0, 4q]
-	ASSERT(HERE, *xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!");
+	ASSERT(*xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!");
 
 #if !KARATSUBA_CMUL
 	// Standard complex 4-multiply:
 	t01   = prodq8(a0, b1<<3);			// a0    in [0, B]: prodq8hi(a0,8b1) in [0, q], t01 in [0, 2q]
 	*yout = prodq8(a1, b0<<3) + t01;	// a1    in [0, B]: prodq8hi(a1,8b0) in [0, q], t10 in [0, 2q]; yout in [0, 4q]
-	ASSERT(HERE, *yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!");
+	ASSERT(*yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!");
 #else
 	// Karatsuba variant:
 	t01 = prodq8((a0 + a1)<<1, (b0 + b1)<<2);	// prodq8hi( 2(a0+a1) , 4(b0+b1) ) in [0,4q], t01 in [0, 5q]
 	*yout = qreduce(t01 - t00 - t11 + q4);	// t01 in [0, 5q] but t01-t00-t11 in [0,4q], so no overflow in t01-t00-t11+q4.
-	ASSERT(HERE, t01 <= (q4 + q), "ERROR: t01 > 5q in Karatsuba-part of CMUL_MODQ!");
+	ASSERT(t01 <= (q4 + q), "ERROR: t01 > 5q in Karatsuba-part of CMUL_MODQ!");
 	// This version reduces both parts of the output:
   #if 0
 	uint64 tmp = (q<<2) - t11;					// tmp in [2q, 4q]
@@ -501,19 +501,19 @@ void cmul_modq(const uint64 a0, const uint64 a1, const uint64 b0, const uint64 b
 void cmul_modq8(const uint64 a0, const uint64 a1, const uint64 b0, const uint64 b1, uint64*xout, uint64*yout)
 {
 	uint64 t00,t01,t11;
-	ASSERT(HERE, ( a0 <= bb && a1 <= bb ), "ERROR: CMUL_MODQ8 A-input out of range!");
-	ASSERT(HERE, (!(b0 & 7) && !(b1 & 7)), "ERROR: CMUL_MODQ8 B-input not divisible by 8!");
+	ASSERT(( a0 <= bb && a1 <= bb ), "ERROR: CMUL_MODQ8 A-input out of range!");
+	ASSERT((!(b0 & 7) && !(b1 & 7)), "ERROR: CMUL_MODQ8 B-input not divisible by 8!");
 
 	// Bounds: b0,b1 in [0,q], so 4(b0+b1) in [0,8q]; prodq8lo/8 always in [0, q].
 	t00 = prodq8(a0, b0);		// a0    in [0, B]: prodq8hi(a0,8b0) in [0, q], t00 in [0, 2q]
 	t11 = prodq8(a1, b1);		// a1    in [0, B]: prodq8hi(a1,8b1) in [0, q], t11 in [0, 2q]
 	*xout = t00 - t11 + q2;		// xout in [0, 4q]
-	ASSERT(HERE, *xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!");
+	ASSERT(*xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!");
 
 	// Standard complex 4-multiply is only option here:
 	t01   = prodq8(a0, b1);			// a0    in [0, B]: prodq8hi(a0,8b1) in [0, q], t01 in [0, 2q]
 	*yout = prodq8(a1, b0) + t01;	// a1    in [0, B]: prodq8hi(a1,8b0) in [0, q], t10 in [0, 2q]; yout in [0, 4q]
-	ASSERT(HERE, *yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!");
+	ASSERT(*yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!");
 	return;
 }
 
@@ -531,10 +531,10 @@ void csqr_modq(const uint64 a0, const uint64 a1, uint64*xout, uint64*yout)
 {
 	// This version reduces both parts of the output...
 	*xout = prodq8((a0 + a1)<<1, (a0 - a1 + q)<<2);	// prodq8hi( 2(a0+a1) , 4(a0-a1+q) ) in [0,4q]; xout in [0,5q]
-	ASSERT(HERE, *xout <= (q4+q), "ERROR: xout >= 5q in CSQR_MODQ!");
+	ASSERT(*xout <= (q4+q), "ERROR: xout >= 5q in CSQR_MODQ!");
 
 	*yout = prodq8(a0<<2, a1<<2);					// prodq8hi(     4*a0 , 4*a1       ) in [0,2q]; yout in [0,3q]
-	ASSERT(HERE, *yout < (q4-q), "ERROR: yout > 3q in CSQR_MODQ!");
+	ASSERT(*yout < (q4-q), "ERROR: yout > 3q in CSQR_MODQ!");
 }
 
 /***************/
@@ -574,12 +574,12 @@ void prim_root_q(const uint64 ord, uint64*root_re, uint64*root_im)
 	uint64 r0,i0,rm,im,rtmp,itmp,pow;
 
 	// Maximal order (q^2-1) = 2^62 * (2^60-1), allowing power-of-2 roots up to 2^62:
-	ASSERT(HERE, zbits < 63, "PRIM_ROOT_Q: Maximal power-of-2 roots = 2^62!");
+	ASSERT(zbits < 63, "PRIM_ROOT_Q: Maximal power-of-2 roots = 2^62!");
 
 	// First raise result to the [(2^60-1)/(ord >> trailz(ord))]th power using LR binary powering:
 	itmp = (1ull << 60) - 1;
 	pow = itmp/(ord >> zbits);		// Odd component of the needed power; this should have 0 remainder for legal ord values
-	ASSERT(HERE, itmp == pow*(ord >> zbits), "pow does not divide 2^60-1!");
+	ASSERT(itmp == pow*(ord >> zbits), "pow does not divide 2^60-1!");
 	pow = pow << (leadz64(pow)+1);	// Left-justify pow and shift leftmost bit off.
 
 	// 6 + I is a primitive root of full order q^2 - 1:
diff --git a/src/get_cpuid.c b/src/get_cpuid.c
index a305fc27..b1aa3385 100755
--- a/src/get_cpuid.c
+++ b/src/get_cpuid.c
@@ -162,10 +162,10 @@
 			__cpuid(CPUInfo, i);
 		#if 0
 			printf("\nFor InfoType %d\n", i);
-			printf("CPUInfo[0] = 0x%x\n", CPUInfo[0]);
-			printf("CPUInfo[1] = 0x%x\n", CPUInfo[1]);
-			printf("CPUInfo[2] = 0x%x\n", CPUInfo[2]);
-			printf("CPUInfo[3] = 0x%x\n", CPUInfo[3]);
+			printf("CPUInfo[0] = %#x\n", CPUInfo[0]);
+			printf("CPUInfo[1] = %#x\n", CPUInfo[1]);
+			printf("CPUInfo[2] = %#x\n", CPUInfo[2]);
+			printf("CPUInfo[3] = %#x\n", CPUInfo[3]);
 		#endif
 			/* Interpret CPU feature information. */
 			if(i == 1)
@@ -199,10 +199,10 @@
 		{
 			__cpuid(CPUInfo, i);
 			printf("\nFor InfoType %x\n", i);
-			printf("CPUInfo[0] = 0x%x\n", CPUInfo[0]);
-			printf("CPUInfo[1] = 0x%x\n", CPUInfo[1]);
-			printf("CPUInfo[2] = 0x%x\n", CPUInfo[2]);
-			printf("CPUInfo[3] = 0x%x\n", CPUInfo[3]);
+			printf("CPUInfo[0] = %#x\n", CPUInfo[0]);
+			printf("CPUInfo[1] = %#x\n", CPUInfo[1]);
+			printf("CPUInfo[2] = %#x\n", CPUInfo[2]);
+			printf("CPUInfo[3] = %#x\n", CPUInfo[3]);
 
 			/* Interpret CPU brand string and cache information. */
 			if(i == 0x80000002)
diff --git a/src/get_fft_radices.c b/src/get_fft_radices.c
index 9e975d97..e939205a 100755
--- a/src/get_fft_radices.c
+++ b/src/get_fft_radices.c
@@ -2525,7 +2525,7 @@ int	get_fft_radices(uint32 kblocks, int radix_set, uint32 *nradices, uint32 radi
 		 return ERR_FFTLENGTH_ILLEGAL;
 	}
 
-	ASSERT(HERE, rvec[0] <= MAX_RADIX, "Leading radix exceeds value of MAX_RADIX set in Mdata.h file!");
+	ASSERT(rvec[0] <= MAX_RADIX, "Leading radix exceeds value of MAX_RADIX set in Mdata.h file!");
 
 	// Check that there are at least 2 radices:
 	if(numrad < 2) {
@@ -2535,7 +2535,7 @@ int	get_fft_radices(uint32 kblocks, int radix_set, uint32 *nradices, uint32 radi
 
 	/* If user provided a radix array, make sure they gave a valid dimension: */
 	if(radix_vec)
-		ASSERT(HERE, radix_vec_dim >=	numrad,"get_fft_radices: radix_vec_dim has illegal value!");
+		ASSERT(radix_vec_dim >=	numrad,"get_fft_radices: radix_vec_dim has illegal value!");
 
 	/* Check that N/2 = {product of the radices}, and if valid nradices and radix_vec pointers supplied,
 	copy radices to the latter and	numrad to the former: */
@@ -2560,7 +2560,7 @@ int	get_fft_radices(uint32 kblocks, int radix_set, uint32 *nradices, uint32 radi
 	if(rad_prod != n/2)
 	{
 		fprintf(stderr,"N = %u, radix_set = %u : product of complex radices %u != (FFT length/2)\n", n, radix_set, rad_prod);
-		ASSERT(HERE, 0,"0");
+		ASSERT(0,"0");
 	}
 
 	return 0;
@@ -2596,18 +2596,18 @@ void	test_fft_radixtables()
 			}
 			else if(retval == ERR_RADIXSET_UNAVAILABLE)
 			{
-				ASSERT(HERE, radset != 0, "test_fft_radixtables: Should only see ERR_RADIXSET_UNAVAILABLE for nonzero radix set index!");
+				ASSERT(radset != 0, "test_fft_radixtables: Should only see ERR_RADIXSET_UNAVAILABLE for nonzero radix set index!");
 				break;
 			}
 			else if(retval == ERR_FFTLENGTH_ILLEGAL)
 			{
 				fprintf(stderr,"ERROR: illegal FFT length %u K in test_fft_radixtables self-test!\n",kblocks);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 			else
 			{
 				fprintf(stderr,"ERROR: unknown return value %d in test_fft_radixtables self-test; i = %d, kblocks = %u, radset = %u.\n", retval, i, kblocks, radset);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 		++i;
@@ -2624,18 +2624,18 @@ uint32 get_default_fft_length(uint64 p)
 	uint32 leadingRadixVec[N_LEADING_RADICES] = {8,9,10,11,12,13,14,15};
 	uint32 i, twoK, fftLen;
 
-	ASSERT(HERE, PMAX > PMIN,"get_default_fft_length: PMAX > PMIN");
+	ASSERT(PMAX > PMIN,"get_default_fft_length: PMAX > PMIN");
 	if(p < PMIN || p > PMAX)
 	{
-		fprintf(stderr,"get_default_fft_length: invalid value for exponent %llu\n",p);
-		ASSERT(HERE, 0,"0");
+		fprintf(stderr,"get_default_fft_length: invalid value for exponent %" PRIu64 "\n",p);
+		ASSERT(0,"0");
 		return 0;
 	}
 
 	/* Starting with N = 1K, Loop over all FFT lengths of form {8,9,10,11,12,13,14,15}*2^m,
 	and return the smallest one for which maxP >= p: */
 	i = 0;
-	ASSERT(HERE, 1024%leadingRadixVec[i] == 0,"get_default_fft_length: 1024%leadingRadixVec[0] == 0");
+	ASSERT(1024%leadingRadixVec[i] == 0,"get_default_fft_length: 1024%leadingRadixVec[0] == 0");
 	twoK = 1024/leadingRadixVec[i];
 	fftLen = leadingRadixVec[i]*twoK;
 	for(;;)
@@ -2664,7 +2664,7 @@ uint32 get_default_fft_length(uint64 p)
 	if((fftLen >> 10) == 589824)
 		fprintf(stderr,"get_default_fft_length: Allowing fftLen 576M just for informational purposes ... note this length is not supported.\n");
 	else
-		ASSERT(HERE, 0,"get_default_fft_length: fftLen > MAX_FFT_LENGTH_IN_K!");
+		ASSERT(0,"get_default_fft_length: fftLen > MAX_FFT_LENGTH_IN_K!");
 	return 0;
 }
 
@@ -2679,7 +2679,7 @@ uint32 get_nextlarger_fft_length(uint32 n)
 	if(get_fft_radices((n >> 10), 0, 0x0, 0x0, 0) != 0)
 	{
 		sprintf(cbuf, "get_nextlarger_fft_length: Illegal or Unsupported input FFT length %u\n", n);
-		ASSERT(HERE, 0, cbuf);
+		ASSERT(0, cbuf);
 	}
 
 	/* Extract leading 4 bits of input FFT lengths, thus decomposing it into the form {8,9,10,11,12,13,14,15}*2^m,
@@ -2688,7 +2688,7 @@ uint32 get_nextlarger_fft_length(uint32 n)
 	*/
 	rem2 = 32 - leadz32(n) - 4;
 	lead4 = n >> rem2;
-	ASSERT(HERE, lead4 > 7 && lead4 < 16,"get_nextlarger_fft_length: leading 4 bits of input FFT length out of range!");
+	ASSERT(lead4 > 7 && lead4 < 16,"get_nextlarger_fft_length: leading 4 bits of input FFT length out of range!");
 
 	/* Make sure next-larger FFT length is supported: */
 	++lead4;
diff --git a/src/get_preferred_fft_radix.c b/src/get_preferred_fft_radix.c
index 51d07b98..fa7a92c1 100755
--- a/src/get_preferred_fft_radix.c
+++ b/src/get_preferred_fft_radix.c
@@ -119,23 +119,23 @@ uint32	get_preferred_fft_radix(uint32 kblocks)
 					if(i == kblocks) {
 						if(found) {
 							sprintf(cbuf,"Multiple cfg-file entries for FFT length %uK encountered in %s - please delete or comment out all but one entry for this length, save the file and retry.",kblocks,CONFIGFILE);
-							ASSERT(HERE,0,cbuf);
+							ASSERT(0,cbuf);
 						} else
 							found = TRUE;
 					}
 					if(sscanf(char_addr + 11, "%lf", &tcurr) == 1) {	// 11 chars in "msec/iter ="
-						ASSERT(HERE, tcurr >= 0, "tcurr < 0!");
+						ASSERT(tcurr >= 0, "tcurr < 0!");
 						if((tbest == 0.0) || ((tcurr > 0.0) && (tcurr < tbest))) {
 							if((char_addr = strstr(in_line, "radices =")) == 0x0) {
-								snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: invalid format for %s file: 'radices =' not found in timing-data line %s", CONFIGFILE, in_line);
-								ASSERT(HERE, 0, cbuf);
+								snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: invalid format for %s file: 'radices =' not found in timing-data line %s", CONFIGFILE, in_line);
+								ASSERT(0, cbuf);
 							}
 							char_addr += 9;	// 9 chars in "radices ="
 							kprod = 1;	/* accumulate product of radices */
 							for(j = 0; j < 10; j++) {	/* Read in the radices */
 								if(sscanf(char_addr, "%d", &k) != 1) {
-									snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: invalid format for %s file: failed to read %dth element of radix set, offending input line %s", CONFIGFILE, j, in_line);
-									ASSERT(HERE, 0, cbuf);
+									snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: invalid format for %s file: failed to read %dth element of radix set, offending input line %s", CONFIGFILE, j, in_line);
+									ASSERT(0, cbuf);
 								} else {
 									// Advance to next WS char following the current numeric token - since sscanf skips leading WS,
 									// Must do this in 2 steps. NOTE we *need* the trailing ; here to serve as executable-statement
@@ -147,15 +147,15 @@ uint32	get_preferred_fft_radix(uint32 kblocks)
 									while( isspace(*char_addr)) char_addr++;	// 1. First skip any WS preceding current numeric token
 									while(!isspace(*char_addr)) char_addr++;	// 2. Look for first WS char following current numeric token
 									if(j == 0)
-										ASSERT(HERE, k <= 1024, "get_preferred_fft_radix: Leading radix > 1024: out of range!");
+										ASSERT(k <= 1024, "get_preferred_fft_radix: Leading radix > 1024: out of range!");
 									else if(k) {
-										ASSERT(HERE, k <= 32  , "get_preferred_fft_radix: Intermediate radix > 32: out of range!");
-										ASSERT(HERE, isPow2(k), "get_preferred_fft_radix: Intermediate FFT radix not a power of 2!");
+										ASSERT(k <= 32  , "get_preferred_fft_radix: Intermediate radix > 32: out of range!");
+										ASSERT(isPow2(k), "get_preferred_fft_radix: Intermediate FFT radix not a power of 2!");
 									}
 									/* If (i == kblocks), store the data directly into the NRADICES and RADIX_VEC[] globals: */
 									if(i == kblocks) {
 										if(k == 0) {
-											ASSERT(HERE, !NRADICES, "Zero terminator of radix set found but NRADICES != 0 ... please check your mlucas.cfg file for duplicate FFT-length entries and remove the unwanted ones, or delete the file and rerun the self-test.");
+											ASSERT(!NRADICES, "Zero terminator of radix set found but NRADICES != 0 ... please check your mlucas.cfg file for duplicate FFT-length entries and remove the unwanted ones, or delete the file and rerun the self-test.");
 											NRADICES = j;
 											break;
 										} else {
@@ -184,20 +184,20 @@ uint32	get_preferred_fft_radix(uint32 kblocks)
 							*/
 							kprod *= 2;
 							if((kprod & 1023) != 0) {
-								snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: illegal data in %s file: product of complex radices (%d) not a multiple of 1K! Offending input line %s", CONFIGFILE, kprod, in_line);
-								ASSERT(HERE, 0, cbuf);
+								snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: illegal data in %s file: product of complex radices (%d) not a multiple of 1K! Offending input line %s", CONFIGFILE, kprod, in_line);
+								ASSERT(0, cbuf);
 							}
 							kprod >>= 10;
 							tbest = tcurr;
 							if(i == kblocks) {
 								/* Product of radices must equal complex vector length (n/2): */
 								if(kprod != kblocks) {
-									snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: mismatching data in %s file: (product of complex radices)/2^10 (%d) != kblocks/2 (%d), offending input line %s", CONFIGFILE, kprod, kblocks/2, in_line);
-									ASSERT(HERE, 0, cbuf);
+									snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: mismatching data in %s file: (product of complex radices)/2^10 (%d) != kblocks/2 (%d), offending input line %s", CONFIGFILE, kprod, kblocks/2, in_line);
+									ASSERT(0, cbuf);
 								}
 								retval = i;			/* Preferred FFT length */
 							} else {
-								ASSERT(HERE, i == extractFFTlengthFrom32Bit(retval), "get_preferred_fft_radix: i != extractFFTlengthFrom32Bit(retval)!");
+								ASSERT(i == extractFFTlengthFrom32Bit(retval), "get_preferred_fft_radix: i != extractFFTlengthFrom32Bit(retval)!");
 							}
 						}
 					}
@@ -207,7 +207,7 @@ uint32	get_preferred_fft_radix(uint32 kblocks)
 		fclose(fp);	fp = 0x0;
 	} else {
 		sprintf(cbuf, "CONFIGFILE = %s: open failed -- please run the post-build self-tests as described in the README!", CONFIGFILE);
-		ASSERT(HERE, 0 , cbuf);
+		ASSERT(0 , cbuf);
 	}
 
 	/* Only return nonzero if an entry for the specified FFT length was found.
@@ -229,10 +229,10 @@ uint32	extractFFTlengthFrom32Bit (uint32 n)
 	uint32 i, nrad, retval;
 	/* Bits <0:9> store (leading radix-1): We subtract the 1 so radices up to 1024 can be stored: */
 	retval = (n & 0x3ff) + 1;	n >>= 10;
-	ASSERT(HERE, retval > 4, "extractFFTlengthFrom32Bit: Leading radix must be 5 or larger!");
+	ASSERT(retval > 4, "extractFFTlengthFrom32Bit: Leading radix must be 5 or larger!");
 	/* Bits <10:13> store (number of FFT radices): */
 	nrad   = (n & 0xf)    ;	n >>= 4;
-	ASSERT(HERE, nrad >=  3, "extractFFTlengthFrom32Bit: Number of radices must be 3 or larger!");
+	ASSERT(nrad >=  3, "extractFFTlengthFrom32Bit: Number of radices must be 3 or larger!");
 	/* Each successive pair of higher-order bits stores log2[(intermediate FFT radix)/8]: */
 	for(i = 1; i < nrad; i++)	/* Already done leading radix, so start at 1, not 0 */
 	{
@@ -248,12 +248,12 @@ void	extractFFTradicesFrom32Bit(uint32 n)
 	uint32 i, nrad, retval;
 	/* Bits <0:9> store (leading radix-1): We subtract the 1 so radices up to 1024 can be stored: */
 	retval = (n & 0x3ff) + 1;	n >>= 10;
-	ASSERT(HERE, retval > 4, "extractFFTradicesFrom32Bit: Leading radix must be 5 or larger!");
+	ASSERT(retval > 4, "extractFFTradicesFrom32Bit: Leading radix must be 5 or larger!");
 	RADIX_VEC[0] = retval;
 	/* Bits <10:13> store (number of FFT radices): */
 	nrad   = (n & 0xf)    ;	n >>= 4;
-	ASSERT(HERE, nrad >=  3, "extractFFTradicesFrom32Bit: Number of radices must be 3 or larger!");
-	ASSERT(HERE, nrad <= 10, "extractFFTradicesFrom32Bit: Number of radices must be 10 or smaller!");
+	ASSERT(nrad >=  3, "extractFFTradicesFrom32Bit: Number of radices must be 3 or larger!");
+	ASSERT(nrad <= 10, "extractFFTradicesFrom32Bit: Number of radices must be 10 or smaller!");
 	NRADICES = nrad;
 	/* Each successive pair of higher-order bits stores log2[(intermediate FFT radix)/8]: */
 	for(i = 1; i < 10; i++)	/* Already done leading radix, so start at 1, not 0 */
diff --git a/src/imul_macro.c b/src/imul_macro.c
index 702a99e2..7e9bc2ea 100755
--- a/src/imul_macro.c
+++ b/src/imul_macro.c
@@ -283,8 +283,8 @@ int test_mul()
 		#else
 			MUL_LOHI64(in64[i],in64[j], lo1, hi1);
 		#endif
-			ASSERT(HERE, lo1 == lo0, "test_mul() low-output mismatch!");
-			ASSERT(HERE, hi1 == hi0, "test_mul() hi -output mismatch!");
+			ASSERT(lo1 == lo0, "test_mul() low-output mismatch!");
+			ASSERT(hi1 == hi0, "test_mul() hi -output mismatch!");
 
 		/* Squaring is a special case: */
 		  if(i ==j)
@@ -294,8 +294,8 @@ int test_mul()
 		#else
 			SQR_LOHI64(in64[i], lo1, hi1);
 		#endif
-			ASSERT(HERE, lo1 == lo0, "test_mul() low-output mismatch!");
-			ASSERT(HERE, hi1 == hi0, "test_mul() hi -output mismatch!");
+			ASSERT(lo1 == lo0, "test_mul() low-output mismatch!");
+			ASSERT(hi1 == hi0, "test_mul() hi -output mismatch!");
 		  }
 		}
 	}
diff --git a/src/imul_macro0.h b/src/imul_macro0.h
index b0538c1e..cc7218c2 100755
--- a/src/imul_macro0.h
+++ b/src/imul_macro0.h
@@ -247,10 +247,10 @@ or the with functions using them (if we declare no _-prepended variables local t
 		hi2 = (uint64)bl2 | ((uint64)bh2 << 32);\
 		hi3 = (uint64)bl3 | ((uint64)bh3 << 32);\
 	\
-		/*lo0 = (uint64)al0 + ((uint64)ah0 << 32);	SQR_LOHI64(x0,&a,&b);	if(a != lo0) printf("x,a,lo = %20llu %20llu %20llu\n",x0,a,lo0);	if(b != hi0) printf("x,b,hi = %20llu %20llu %20llu\n",x0,b,hi0);*/\
-		/*lo1 = (uint64)al1 + ((uint64)ah1 << 32);	SQR_LOHI64(x1,&a,&b);	if(a != lo1) printf("x,a,lo = %20llu %20llu %20llu\n",x1,a,lo1);	if(b != hi1) printf("x,b,hi = %20llu %20llu %20llu\n",x1,b,hi1);*/\
-		/*lo2 = (uint64)al2 + ((uint64)ah2 << 32);	SQR_LOHI64(x2,&a,&b);	if(a != lo2) printf("x,a,lo = %20llu %20llu %20llu\n",x2,a,lo2);	if(b != hi2) printf("x,b,hi = %20llu %20llu %20llu\n",x2,b,hi2);*/\
-		/*lo3 = (uint64)al3 + ((uint64)ah3 << 32);	SQR_LOHI64(x3,&a,&b);	if(a != lo3) printf("x,a,lo = %20llu %20llu %20llu\n",x3,a,lo3);	if(b != hi3) printf("x,b,hi = %20llu %20llu %20llu\n",x3,b,hi3);*/\
+		/*lo0 = (uint64)al0 + ((uint64)ah0 << 32);	SQR_LOHI64(x0,&a,&b);	if(a != lo0) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0);	if(b != hi0) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,b,hi0);*/\
+		/*lo1 = (uint64)al1 + ((uint64)ah1 << 32);	SQR_LOHI64(x1,&a,&b);	if(a != lo1) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1);	if(b != hi1) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,b,hi1);*/\
+		/*lo2 = (uint64)al2 + ((uint64)ah2 << 32);	SQR_LOHI64(x2,&a,&b);	if(a != lo2) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2);	if(b != hi2) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,b,hi2);*/\
+		/*lo3 = (uint64)al3 + ((uint64)ah3 << 32);	SQR_LOHI64(x3,&a,&b);	if(a != lo3) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3);	if(b != hi3) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,b,hi3);*/\
 	\
 		/* loj = MULL64(loj,qinvj) : */\
 	\
@@ -281,10 +281,10 @@ or the with functions using them (if we declare no _-prepended variables local t
 		y2 = (uint64)bl2 | ((uint64)bh2 << 32);\
 		y3 = (uint64)bl3 | ((uint64)bh3 << 32);\
 	\
-	/*a = MULH64(lo0,q0);	if(a != y0) printf("lo,q,hi = %20llu %20llu %20llu\n",x0,a,lo0);*/\
-	/*a = MULH64(lo1,q1);	if(a != y1) printf("lo,q,hi = %20llu %20llu %20llu\n",x1,a,lo1);*/\
-	/*a = MULH64(lo2,q2);	if(a != y2) printf("lo,q,hi = %20llu %20llu %20llu\n",x2,a,lo2);*/\
-	/*a = MULH64(lo3,q3);	if(a != y3) printf("lo,q,hi = %20llu %20llu %20llu\n",x3,a,lo3);*/\
+	/*a = MULH64(lo0,q0);	if(a != y0) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0);*/\
+	/*a = MULH64(lo1,q1);	if(a != y1) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1);*/\
+	/*a = MULH64(lo2,q2);	if(a != y2) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2);*/\
+	/*a = MULH64(lo3,q3);	if(a != y3) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3);*/\
 	}
 
 /********************************************************************************/
@@ -1225,14 +1225,14 @@ or the with functions using them (if we declare no _-prepended variables local t
 			qil2 = (uint32) qinv2;\
 			qil3 = (uint32) qinv3;\
 			\
-			DBG_ASSERT(HERE, (ql0  >> 32) == 0,"MOD_INI_Q4: (ql0  >> 32) == 0");\
-			DBG_ASSERT(HERE, (ql1  >> 32) == 0,"MOD_INI_Q4: (ql1  >> 32) == 0");\
-			DBG_ASSERT(HERE, (ql2  >> 32) == 0,"MOD_INI_Q4: (ql2  >> 32) == 0");\
-			DBG_ASSERT(HERE, (ql3  >> 32) == 0,"MOD_INI_Q4: (ql3  >> 32) == 0");\
-			DBG_ASSERT(HERE, (qil0 >> 32) == 0,"MOD_INI_Q4: (qil0 >> 32) == 0");\
-			DBG_ASSERT(HERE, (qil1 >> 32) == 0,"MOD_INI_Q4: (qil1 >> 32) == 0");\
-			DBG_ASSERT(HERE, (qil2 >> 32) == 0,"MOD_INI_Q4: (qil2 >> 32) == 0");\
-			DBG_ASSERT(HERE, (qil3 >> 32) == 0,"MOD_INI_Q4: (qil3 >> 32) == 0");\
+			DBG_ASSERT((ql0  >> 32) == 0,"MOD_INI_Q4: (ql0  >> 32) == 0");\
+			DBG_ASSERT((ql1  >> 32) == 0,"MOD_INI_Q4: (ql1  >> 32) == 0");\
+			DBG_ASSERT((ql2  >> 32) == 0,"MOD_INI_Q4: (ql2  >> 32) == 0");\
+			DBG_ASSERT((ql3  >> 32) == 0,"MOD_INI_Q4: (ql3  >> 32) == 0");\
+			DBG_ASSERT((qil0 >> 32) == 0,"MOD_INI_Q4: (qil0 >> 32) == 0");\
+			DBG_ASSERT((qil1 >> 32) == 0,"MOD_INI_Q4: (qil1 >> 32) == 0");\
+			DBG_ASSERT((qil2 >> 32) == 0,"MOD_INI_Q4: (qil2 >> 32) == 0");\
+			DBG_ASSERT((qil3 >> 32) == 0,"MOD_INI_Q4: (qil3 >> 32) == 0");\
 			\
 			qh0  = (uint32)(q0    >> 32);\
 			qh1  = (uint32)(q1    >> 32);\
@@ -1243,14 +1243,14 @@ or the with functions using them (if we declare no _-prepended variables local t
 			qih2 = (uint32)(qinv2 >> 32);\
 			qih3 = (uint32)(qinv3 >> 32);\
 			\
-			DBG_ASSERT(HERE, (qh0  >> 32) == 0,"MOD_INI_Q4: (qh0  >> 32) == 0");\
-			DBG_ASSERT(HERE, (qh1  >> 32) == 0,"MOD_INI_Q4: (qh1  >> 32) == 0");\
-			DBG_ASSERT(HERE, (qh2  >> 32) == 0,"MOD_INI_Q4: (qh2  >> 32) == 0");\
-			DBG_ASSERT(HERE, (qh3  >> 32) == 0,"MOD_INI_Q4: (qh3  >> 32) == 0");\
-			DBG_ASSERT(HERE, (qih0 >> 32) == 0,"MOD_INI_Q4: (qih0 >> 32) == 0");\
-			DBG_ASSERT(HERE, (qih1 >> 32) == 0,"MOD_INI_Q4: (qih1 >> 32) == 0");\
-			DBG_ASSERT(HERE, (qih2 >> 32) == 0,"MOD_INI_Q4: (qih2 >> 32) == 0");\
-			DBG_ASSERT(HERE, (qih3 >> 32) == 0,"MOD_INI_Q4: (qih3 >> 32) == 0");\
+			DBG_ASSERT((qh0  >> 32) == 0,"MOD_INI_Q4: (qh0  >> 32) == 0");\
+			DBG_ASSERT((qh1  >> 32) == 0,"MOD_INI_Q4: (qh1  >> 32) == 0");\
+			DBG_ASSERT((qh2  >> 32) == 0,"MOD_INI_Q4: (qh2  >> 32) == 0");\
+			DBG_ASSERT((qh3  >> 32) == 0,"MOD_INI_Q4: (qh3  >> 32) == 0");\
+			DBG_ASSERT((qih0 >> 32) == 0,"MOD_INI_Q4: (qih0 >> 32) == 0");\
+			DBG_ASSERT((qih1 >> 32) == 0,"MOD_INI_Q4: (qih1 >> 32) == 0");\
+			DBG_ASSERT((qih2 >> 32) == 0,"MOD_INI_Q4: (qih2 >> 32) == 0");\
+			DBG_ASSERT((qih3 >> 32) == 0,"MOD_INI_Q4: (qih3 >> 32) == 0");\
 		}
 
 		/* For each input xj, calculates the following sequence:
@@ -1292,10 +1292,10 @@ or the with functions using them (if we declare no _-prepended variables local t
 			hi2 = (uint64)bl2 | ((uint64)bh2 << 32);\
 			hi3 = (uint64)bl3 | ((uint64)bh3 << 32);\
 		/* DEBUG:\
-		lo0 = (uint64)al0 + ((uint64)ah0 << 32);	SQR_LOHI64(x0,&a,&b);	if(a != lo0) printf("x,a,lo = %20llu %20llu %20llu\n",x0,a,lo0);	if(b != hi0) printf("x,b,hi = %20llu %20llu %20llu\n",x0,b,hi0);	\
-		lo1 = (uint64)al1 + ((uint64)ah1 << 32);	SQR_LOHI64(x1,&a,&b);	if(a != lo1) printf("x,a,lo = %20llu %20llu %20llu\n",x1,a,lo1);	if(b != hi1) printf("x,b,hi = %20llu %20llu %20llu\n",x1,b,hi1);	\
-		lo2 = (uint64)al2 + ((uint64)ah2 << 32);	SQR_LOHI64(x2,&a,&b);	if(a != lo2) printf("x,a,lo = %20llu %20llu %20llu\n",x2,a,lo2);	if(b != hi2) printf("x,b,hi = %20llu %20llu %20llu\n",x2,b,hi2);	\
-		lo3 = (uint64)al3 + ((uint64)ah3 << 32);	SQR_LOHI64(x3,&a,&b);	if(a != lo3) printf("x,a,lo = %20llu %20llu %20llu\n",x3,a,lo3);	if(b != hi3) printf("x,b,hi = %20llu %20llu %20llu\n",x3,b,hi3);	\
+		lo0 = (uint64)al0 + ((uint64)ah0 << 32);	SQR_LOHI64(x0,&a,&b);	if(a != lo0) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0);	if(b != hi0) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,b,hi0);	\
+		lo1 = (uint64)al1 + ((uint64)ah1 << 32);	SQR_LOHI64(x1,&a,&b);	if(a != lo1) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1);	if(b != hi1) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,b,hi1);	\
+		lo2 = (uint64)al2 + ((uint64)ah2 << 32);	SQR_LOHI64(x2,&a,&b);	if(a != lo2) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2);	if(b != hi2) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,b,hi2);	\
+		lo3 = (uint64)al3 + ((uint64)ah3 << 32);	SQR_LOHI64(x3,&a,&b);	if(a != lo3) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3);	if(b != hi3) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,b,hi3);	\
 		*/\
 			/* loj = MULL64(loj,qinvj) : */\
 		/* DEBUG:\
@@ -1326,10 +1326,10 @@ or the with functions using them (if we declare no _-prepended variables local t
 			y2 = (uint64)bl2 | ((uint64)bh2 << 32);\
 			y3 = (uint64)bl3 | ((uint64)bh3 << 32);\
 		/* DEBUG:\
-			a = MULH64(lo0,q0);	if(a != y0) printf("lo,q,hi = %20llu %20llu %20llu\n",x0,a,lo0);	\
-			a = MULH64(lo1,q1);	if(a != y1) printf("lo,q,hi = %20llu %20llu %20llu\n",x1,a,lo1);	\
-			a = MULH64(lo2,q2);	if(a != y2) printf("lo,q,hi = %20llu %20llu %20llu\n",x2,a,lo2);	\
-			a = MULH64(lo3,q3);	if(a != y3) printf("lo,q,hi = %20llu %20llu %20llu\n",x3,a,lo3);	\
+			a = MULH64(lo0,q0);	if(a != y0) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0);	\
+			a = MULH64(lo1,q1);	if(a != y1) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1);	\
+			a = MULH64(lo2,q2);	if(a != y2) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2);	\
+			a = MULH64(lo3,q3);	if(a != y3) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3);	\
 		*/\
 		}
 
@@ -1375,7 +1375,7 @@ or the with functions using them (if we declare no _-prepended variables local t
 		char s0[21],s1[21];\
 		uint64 _t,_a,_b;\
 		\
-		ASSERT(HERE, ((uint64)(_y) >> 32) == 0,"MUL64x32: ((_y) >> 32) == 0");\
+		ASSERT(((uint64)(_y) >> 32) == 0,"MUL64x32: ((_y) >> 32) == 0");\
 		MUL_LOHI64((_x), (uint64)(_y), _a, _b);\
 		\
 		_lo = ((uint32)((_x) & 0x00000000ffffffff)) * (_y);	/* a*c */\
@@ -1390,7 +1390,7 @@ or the with functions using them (if we declare no _-prepended variables local t
 			printf("x = %s, y = %s\n", s0[convert_uint64_base10_char(s0,_x )], s1[convert_uint64_base10_char(s1,_y)]);\
 			printf("LO= %s, A = %s\n", s0[convert_uint64_base10_char(s0,_lo)], s1[convert_uint64_base10_char(s1,_a)]);\
 			printf("HI= %s, B = %s\n", s0[convert_uint64_base10_char(s0,_hi)], s1[convert_uint64_base10_char(s0,_b)]);\
-			ASSERT(HERE, 0,"0");\
+			ASSERT(0,"0");\
 		}\
 	}
    #else
diff --git a/src/imul_macro1.h b/src/imul_macro1.h
index f1dc277f..d42bdd41 100755
--- a/src/imul_macro1.h
+++ b/src/imul_macro1.h
@@ -87,8 +87,8 @@ that is advantageous on at least an appreciable set of CPUs.
 
 #define ADD160(__x, __y, __sum)\
 {\
-	DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"ADD160: (__x.d2 >> 32) == 0");\
-	DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"ADD160: (__y.d2 >> 32) == 0");\
+	DBG_ASSERT((__x.d2 >> 32) == 0,"ADD160: (__x.d2 >> 32) == 0");\
+	DBG_ASSERT((__y.d2 >> 32) == 0,"ADD160: (__y.d2 >> 32) == 0");\
 	ADD192(__x, __y, __sum);\
 	__sum.d2 &= 0x00000000ffffffff;	/* In case of add need to take care to get proper mod-2^160 result */\
 }
@@ -192,8 +192,8 @@ that is advantageous on at least an appreciable set of CPUs.
 
 #define SUB160(__x, __y, __dif)\
 {\
-	DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"SUB160: (__x.d2 >> 32) == 0");\
-	DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"SUB160: (__y.d2 >> 32) == 0");\
+	DBG_ASSERT((__x.d2 >> 32) == 0,"SUB160: (__x.d2 >> 32) == 0");\
+	DBG_ASSERT((__y.d2 >> 32) == 0,"SUB160: (__y.d2 >> 32) == 0");\
 	SUB192(__x, __y, __dif);\
 	__dif.d2 &= 0x00000000ffffffff;	/* In case of add need to take care to get proper mod-2^160 result */\
 }
@@ -301,7 +301,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 /* Left-shifts: */
 #define LSHIFT128(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT128: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"LSHIFT128: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -326,7 +326,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 }
 #define LSHIFT96(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT96: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"LSHIFT96: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -357,7 +357,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 
 #define LSHIFT192(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT192: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"LSHIFT192: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -398,14 +398,14 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 }
 #define LSHIFT160(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"LSHIFT160: ((uint64)__x.d2 >> 32) == 0");\
+	DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"LSHIFT160: ((uint64)__x.d2 >> 32) == 0");\
 	LSHIFT192(__x,__n, __y);\
 	__y.d2 &= 0x00000000ffffffff;\
 }
 
 #define LSHIFT256(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT256: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"LSHIFT256: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -469,7 +469,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 /* (Logical) Right-shifts: */
 #define RSHIFT128(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT128: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"RSHIFT128: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -494,7 +494,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 }
 #define RSHIFT96(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT96: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"RSHIFT96: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -530,7 +530,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 #define RSHIFT192(__x, __n, __y)\
 {\
 	int __lsh,__rsh;\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT192: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"RSHIFT192: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -576,14 +576,14 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 }
 #define RSHIFT160(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"RSHIFT160: ((uint64)__x.d2 >> 32) == 0");\
+	DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"RSHIFT160: ((uint64)__x.d2 >> 32) == 0");\
 	RSHIFT192(__x,__n, __y);\
-	DBG_ASSERT(HERE, (uint64)__y.d2 <= (uint64)__x.d2,"RSHIFT160: (uint64)__y.d2 <= (uint64)__x.d2");\
+	DBG_ASSERT((uint64)__y.d2 <= (uint64)__x.d2,"RSHIFT160: (uint64)__y.d2 <= (uint64)__x.d2");\
 }
 
 #define RSHIFT256(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT256: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"RSHIFT256: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -647,38 +647,38 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 /* Left-shifts: */
 #define LSHIFT_FAST128(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (uint64)__n != 0,"LSHIFT_FAST128: (uint64)__n != 0");\
-	DBG_ASSERT(HERE, (uint64)__n < 64,"LSHIFT_FAST128: (uint64)__n < 64");\
+	DBG_ASSERT((uint64)__n != 0,"LSHIFT_FAST128: (uint64)__n != 0");\
+	DBG_ASSERT((uint64)__n < 64,"LSHIFT_FAST128: (uint64)__n < 64");\
 	__y.d1 = ((uint64)__x.d1 << __n) + ((uint64)__x.d0 >> (64-__n));\
 	__y.d0 = ((uint64)__x.d0 << __n);\
 }
 #define LSHIFT_FAST96(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >  0,"LSHIFT96: (int64)__n >  0");\
-	DBG_ASSERT(HERE, (int64)__n < 32,"LSHIFT96: (int64)__n < 32");\
+	DBG_ASSERT((int64)__n >  0,"LSHIFT_FAST96: (int64)__n >  0");\
+	DBG_ASSERT((int64)__n < 32,"LSHIFT_FAST96: (int64)__n < 32");\
 	__y.d1 = ((uint32)__x.d1 << __n) + (uint32)((uint64)__x.d0 >> (64-__n));\
 	__y.d0 = ((uint64)__x.d0 << __n);\
 }
 
 #define LSHIFT_FAST192(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (uint64)__n != 0,"LSHIFT_FAST192: (uint64)__n != 0");\
-	DBG_ASSERT(HERE, (uint64)__n < 64,"LSHIFT_FAST192: (uint64)__n < 64");\
+	DBG_ASSERT((uint64)__n != 0,"LSHIFT_FAST192: (uint64)__n != 0");\
+	DBG_ASSERT((uint64)__n < 64,"LSHIFT_FAST192: (uint64)__n < 64");\
 	__y.d2 = ((uint64)__x.d2 << __n) + ((uint64)__x.d1 >> (64-__n));\
 	__y.d1 = ((uint64)__x.d1 << __n) + ((uint64)__x.d0 >> (64-__n));\
 	__y.d0 = ((uint64)__x.d0 << __n);\
 }
 #define LSHIFT_FAST160(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"LSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\
+	DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"LSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\
 	LSHIFT_FAST192(__x,__n, __y);\
 	__y.d2 &= 0x00000000ffffffff;\
 }
 
 #define LSHIFT_FAST256(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (uint64)__n != 0,"LSHIFT_FAST256: (uint64)__n != 0");\
-	DBG_ASSERT(HERE, (uint64)__n < 64,"LSHIFT_FAST256: (uint64)__n < 64");\
+	DBG_ASSERT((uint64)__n != 0,"LSHIFT_FAST256: (uint64)__n != 0");\
+	DBG_ASSERT((uint64)__n < 64,"LSHIFT_FAST256: (uint64)__n < 64");\
 	__y.d3 = ((uint64)__x.d3 << __n) + ((uint64)__x.d2 >> (64-__n));\
 	__y.d2 = ((uint64)__x.d2 << __n) + ((uint64)__x.d1 >> (64-__n));\
 	__y.d1 = ((uint64)__x.d1 << __n) + ((uint64)__x.d0 >> (64-__n));\
@@ -688,38 +688,38 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned.
 /* (Logical) Right-shifts: */
 #define RSHIFT_FAST128(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (uint64)__n != 0,"RSHIFT_FAST128: (uint64)__n != 0");\
-	DBG_ASSERT(HERE, (uint64)__n < 64,"RSHIFT_FAST128: (uint64)__n < 64");\
+	DBG_ASSERT((uint64)__n != 0,"RSHIFT_FAST128: (uint64)__n != 0");\
+	DBG_ASSERT((uint64)__n < 64,"RSHIFT_FAST128: (uint64)__n < 64");\
 	__y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\
 	__y.d1 = ((uint64)__x.d1 >> __n);\
 }
 #define RSHIFT_FAST96(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >  0,"RSHIFT96: (int64)__n >  0");\
-	DBG_ASSERT(HERE, (int64)__n < 32,"RSHIFT96: (int64)__n < 32");\
+	DBG_ASSERT((int64)__n >  0,"RSHIFT_FAST96: (int64)__n >  0");\
+	DBG_ASSERT((int64)__n < 32,"RSHIFT_FAST96: (int64)__n < 32");\
 	__y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\
 	__y.d1 = ((uint32)__x.d1 >> __n);\
 }
 
 #define RSHIFT_FAST192(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (uint64)__n != 0,"RSHIFT_FAST192: (uint64)__n != 0");\
-	DBG_ASSERT(HERE, (uint64)__n < 64,"RSHIFT_FAST192: (uint64)__n < 64");\
+	DBG_ASSERT((uint64)__n != 0,"RSHIFT_FAST192: (uint64)__n != 0");\
+	DBG_ASSERT((uint64)__n < 64,"RSHIFT_FAST192: (uint64)__n < 64");\
 	__y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\
 	__y.d1 = ((uint64)__x.d1 >> __n) + ((uint64)__x.d2 << (64-__n));\
 	__y.d2 = ((uint64)__x.d2 >> __n);\
 }
 #define RSHIFT_FAST160(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"RSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\
+	DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"RSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\
 	RSHIFT_FAST192(__x,__n, __y);\
 	__y.d2 &= 0x00000000ffffffff;\
 }
 
 #define RSHIFT_FAST256(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (uint64)__n != 0,"RSHIFT_FAST256: (uint64)__n != 0");\
-	DBG_ASSERT(HERE, (uint64)__n < 64,"RSHIFT_FAST256: (uint64)__n < 64");\
+	DBG_ASSERT((uint64)__n != 0,"RSHIFT_FAST256: (uint64)__n != 0");\
+	DBG_ASSERT((uint64)__n < 64,"RSHIFT_FAST256: (uint64)__n < 64");\
 	__y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\
 	__y.d1 = ((uint64)__x.d1 >> __n) + ((uint64)__x.d2 << (64-__n));\
 	__y.d2 = ((uint64)__x.d2 >> __n) + ((uint64)__x.d3 << (64-__n));\
@@ -751,7 +751,7 @@ Cast the result of the high-part-equals-zero test to a signed 32-bit (-1) becaus
 
 #define LSHIFT96_PTR(__x, __n, __y)\
 {\
-	DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT96_PTR: (int64)__n >= 0");\
+	DBG_ASSERT((int64)__n >= 0,"LSHIFT96_PTR: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -896,7 +896,7 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
     {\
 		uint64 __l,__m,__h,__a,__b,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\
 		__t   = (uint64)(__x.d1);\
 		__h   = __t*__t;\
 		SQR_LOHI64(__x.d0,    &__l,&__m);\
@@ -914,7 +914,7 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
     {\
 		uint64 __l,__m,__h,__a,__b,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 31) == 0,"SQR_LOHI95: (__x.d1 >> 31) == 0");\
+		DBG_ASSERT((__x.d1 >> 31) == 0,"SQR_LOHI95: (__x.d1 >> 31) == 0");\
 		__t   = (uint64)(__x.d1);\
 		__h   = __t*__t;\
 		SQR_LOHI64(__x.d0,         &__l,&__m);\
@@ -998,7 +998,7 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
     {\
 		uint64 __l,__m,__h,__a,__b;\
 		uint32 __tt = __x.d1, __hl32,__hh32;\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\
 		MUL64x32(__x.d0,__tt, __a, __b);\
 		SQR_LOHI64(__x.d0,     __l, __m);\
 		MUL_LOHI32(__tt,__tt,__hl32,__hh32);\
@@ -1025,10 +1025,10 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 		uint64	__a0,__a1,__a2,__a3,\
 				__b0,__b1,__b2,__b3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\
 		\
 		SQR_LOHI    (__x0.d0,                             __a0 ,     __b0 );\
 		SQR_LOHI    (__x1.d0,                             __a1 ,     __b1 );\
@@ -1091,10 +1091,10 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 				__b0,__b1,__b2,__b3,\
 				__s0,__s1,__s2,__s3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -1229,10 +1229,10 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 				__b0,__b1,__b2,__b3,\
 				__s0,__s1,__s2,__s3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x0.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x1.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x2.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x3.d1 >> 31) == 0");\
+		DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x0.d1 >> 31) == 0");\
+		DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x1.d1 >> 31) == 0");\
+		DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x2.d1 >> 31) == 0");\
+		DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x3.d1 >> 31) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -1296,14 +1296,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 		uint64	__a0,__a1,__a2,__a3,__a4,__a5,__a6,__a7,\
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\
 		\
 		SQR_LOHI    (__x0.d0,                             __a0 ,     __b0 );\
 		SQR_LOHI    (__x1.d0,                             __a1 ,     __b1 );\
@@ -1406,14 +1406,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\
 				__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -1529,14 +1529,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\
 				__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,              __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,              __a1 ,     __b1 );\
@@ -1646,14 +1646,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops.
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\
 				__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x0.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x1.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x2.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x3.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x4.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x5.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x6.d1 >> 31) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x7.d1 >> 31) == 0");\
+		DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x0.d1 >> 31) == 0");\
+		DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x1.d1 >> 31) == 0");\
+		DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x2.d1 >> 31) == 0");\
+		DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x3.d1 >> 31) == 0");\
+		DBG_ASSERT((__x4.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x4.d1 >> 31) == 0");\
+		DBG_ASSERT((__x5.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x5.d1 >> 31) == 0");\
+		DBG_ASSERT((__x6.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x6.d1 >> 31) == 0");\
+		DBG_ASSERT((__x7.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x7.d1 >> 31) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -1743,8 +1743,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
     {\
 		uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
 		__s   = (uint64)(__x.d1);\
 		__t   = (uint64)(__y.d1);\
 		__h   = __s*__t;\
@@ -1766,8 +1766,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
     {\
 		uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
 		__s   = (uint64)(__x.d1);\
 		__t   = (uint64)(__y.d1);\
 		__h   = __s*__t;\
@@ -1795,8 +1795,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
     {\
 		uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
 		__s   = (uint64)(__x.d1);\
 		__t   = (uint64)(__y.d1);\
 		__h   = __s*__t;\
@@ -1817,8 +1817,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
     {\
 		uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\
 		__s   = (uint64)(__x.d1);\
 		__t   = (uint64)(__y.d1);\
 		__h   = __s*__t;\
@@ -1847,8 +1847,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
     {\
 		uint64 __l,__m;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\
 		MUL_LOHI64(__x.d0,__y.d0,&__l,&__m);\
 		__m += __MULL32(__x.d1,__y.d0) + __MULL32(__y.d1,__x.d0);	/* Only need the bottom 32 bits of each product here */\
 		__lo.d0 =  __l;	__lo.d1 = __m & 0x00000000ffffffff;\
@@ -1894,8 +1894,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
     {\
 		uint64 __l,__m;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\
 		MUL_LOHI64(__x.d0,__y.d0, __l, __m);\
 		__m += __MULL32(__x.d1,__y.d0) + __MULL32(__y.d1,__x.d0);	/* Only need the bottom 32 bits of each product here */\
 		__lo.d0 =  __l;	__lo.d1 = __m & 0x00000000ffffffff;\
@@ -1913,15 +1913,15 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
 		uint64	__a0,__a1,__a2,__a3,\
 				__b0,__b1,__b2,__b3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULL96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULL96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULL96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULL96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"MULL96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"MULL96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"MULL96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"MULL96_q4: (__x3.d1 >> 32) == 0");\
 		\
-		DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULL96_q4: (__y0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULL96_q4: (__y1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULL96_q4: (__y2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULL96_q4: (__y3.d1 >> 32) == 0");\
+		DBG_ASSERT((__y0.d1 >> 32) == 0,"MULL96_q4: (__y0.d1 >> 32) == 0");\
+		DBG_ASSERT((__y1.d1 >> 32) == 0,"MULL96_q4: (__y1.d1 >> 32) == 0");\
+		DBG_ASSERT((__y2.d1 >> 32) == 0,"MULL96_q4: (__y2.d1 >> 32) == 0");\
+		DBG_ASSERT((__y3.d1 >> 32) == 0,"MULL96_q4: (__y3.d1 >> 32) == 0");\
 		\
 		MUL_LOHI64(__x0.d0,__y0.d0, __a0, __b0);\
 		MUL_LOHI64(__x1.d0,__y1.d0, __a1, __b1);\
@@ -1955,23 +1955,23 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op.
 		uint64	__a0,__a1,__a2,__a3,__a4,__a5,__a6,__a7,\
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULL96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULL96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULL96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULL96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"MULL96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"MULL96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"MULL96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"MULL96_q8: (__x7.d1 >> 32) == 0");\
-		\
-		DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULL96_q8: (__y0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULL96_q8: (__y1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULL96_q8: (__y2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULL96_q8: (__y3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y4.d1 >> 32) == 0,"MULL96_q8: (__y4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y5.d1 >> 32) == 0,"MULL96_q8: (__y5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y6.d1 >> 32) == 0,"MULL96_q8: (__y6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y7.d1 >> 32) == 0,"MULL96_q8: (__y7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"MULL96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"MULL96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"MULL96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"MULL96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"MULL96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"MULL96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"MULL96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"MULL96_q8: (__x7.d1 >> 32) == 0");\
+		\
+		DBG_ASSERT((__y0.d1 >> 32) == 0,"MULL96_q8: (__y0.d1 >> 32) == 0");\
+		DBG_ASSERT((__y1.d1 >> 32) == 0,"MULL96_q8: (__y1.d1 >> 32) == 0");\
+		DBG_ASSERT((__y2.d1 >> 32) == 0,"MULL96_q8: (__y2.d1 >> 32) == 0");\
+		DBG_ASSERT((__y3.d1 >> 32) == 0,"MULL96_q8: (__y3.d1 >> 32) == 0");\
+		DBG_ASSERT((__y4.d1 >> 32) == 0,"MULL96_q8: (__y4.d1 >> 32) == 0");\
+		DBG_ASSERT((__y5.d1 >> 32) == 0,"MULL96_q8: (__y5.d1 >> 32) == 0");\
+		DBG_ASSERT((__y6.d1 >> 32) == 0,"MULL96_q8: (__y6.d1 >> 32) == 0");\
+		DBG_ASSERT((__y7.d1 >> 32) == 0,"MULL96_q8: (__y7.d1 >> 32) == 0");\
 		\
 		MUL_LOHI64(__x0.d0,__y0.d0, __a0, __b0);\
 		MUL_LOHI64(__x1.d0,__y1.d0, __a1, __b1);\
@@ -2018,8 +2018,8 @@ neglect of the lower bits, but that seems well below the likely level of hardwar
     {\
 		uint64 __m,__h,__a,__b,__c,__d,__s,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\
 		__s   = (uint64)(__x.d1);\
 		__t   = (uint64)(__y.d1);\
 		__h   = __s*__t;\
@@ -2095,8 +2095,8 @@ to get the 16x64==>80-bit intermediate products.
     {\
 		uint64 __a,__b,__xlo,__ylo,__xhi,__yhi,__lo;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 16) == 0,"MULH96_80 : (__x.d1 >> 16) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 16) == 0,"MULH96_80 : (__y.d1 >> 16) == 0");\
+		DBG_ASSERT((__x.d1 >> 16) == 0,"MULH96_80 : (__x.d1 >> 16) == 0");\
+		DBG_ASSERT((__y.d1 >> 16) == 0,"MULH96_80 : (__y.d1 >> 16) == 0");\
 		__xhi =(__x.d1 << 48) + (__x.d0 >> 16);\
 		__yhi =(__y.d1 << 48) + (__y.d0 >> 16);\
 		__xlo = __x.d0 << 48;	/* xlo << 48 */\
@@ -2116,8 +2116,8 @@ to get the 16x64==>80-bit intermediate products.
     {\
 		uint64 __m,__h,__aa,__bb,__cc,__dd,__s,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\
 		__s   = (uint64)(__x.d1);\
 		__t   = (uint64)(__y.d1);\
 		__h   = __s*__t;\
@@ -2150,15 +2150,15 @@ to get the 16x64==>80-bit intermediate products.
 		uint64 __a2,__b2,__c2,__d2,__m2,__h2;\
 		uint64 __a3,__b3,__c3,__d3,__m3,__h3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULH96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULH96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULH96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULH96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"MULH96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"MULH96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"MULH96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"MULH96_q4: (__x3.d1 >> 32) == 0");\
 		\
-		DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH96_q4: (__y0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH96_q4: (__y1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH96_q4: (__y2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH96_q4: (__y3.d1 >> 32) == 0");\
+		DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH96_q4: (__y0.d1 >> 32) == 0");\
+		DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH96_q4: (__y1.d1 >> 32) == 0");\
+		DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH96_q4: (__y2.d1 >> 32) == 0");\
+		DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH96_q4: (__y3.d1 >> 32) == 0");\
 		\
 		MUL_LOHI32(__x0.d1, __y0.d1, __l32_0, __h32_0);\
 		MUL_LOHI32(__x1.d1, __y1.d1, __l32_1, __h32_1);\
@@ -2234,23 +2234,23 @@ to get the 16x64==>80-bit intermediate products.
 		uint64 __a6,__b6,__c6,__d6,__m6,__h6;\
 		uint64 __a7,__b7,__c7,__d7,__m7,__h7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULH96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULH96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULH96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULH96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"MULH96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"MULH96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"MULH96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"MULH96_q8: (__x7.d1 >> 32) == 0");\
-		\
-		DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH96_q8: (__y0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH96_q8: (__y1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH96_q8: (__y2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH96_q8: (__y3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y4.d1 >> 32) == 0,"MULH96_q8: (__y4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y5.d1 >> 32) == 0,"MULH96_q8: (__y5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y6.d1 >> 32) == 0,"MULH96_q8: (__y6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y7.d1 >> 32) == 0,"MULH96_q8: (__y7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"MULH96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"MULH96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"MULH96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"MULH96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"MULH96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"MULH96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"MULH96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"MULH96_q8: (__x7.d1 >> 32) == 0");\
+		\
+		DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH96_q8: (__y0.d1 >> 32) == 0");\
+		DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH96_q8: (__y1.d1 >> 32) == 0");\
+		DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH96_q8: (__y2.d1 >> 32) == 0");\
+		DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH96_q8: (__y3.d1 >> 32) == 0");\
+		DBG_ASSERT((__y4.d1 >> 32) == 0,"MULH96_q8: (__y4.d1 >> 32) == 0");\
+		DBG_ASSERT((__y5.d1 >> 32) == 0,"MULH96_q8: (__y5.d1 >> 32) == 0");\
+		DBG_ASSERT((__y6.d1 >> 32) == 0,"MULH96_q8: (__y6.d1 >> 32) == 0");\
+		DBG_ASSERT((__y7.d1 >> 32) == 0,"MULH96_q8: (__y7.d1 >> 32) == 0");\
 		\
 		__h0 = (uint64)__x0.d1*(uint64)__y0.d1;\
 		__h1 = (uint64)__x1.d1*(uint64)__y1.d1;\
@@ -2973,7 +2973,7 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
     {\
 		uint64 __w0,__w1,__w2,__a,__b;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\
 		SQR_LOHI64(__x.d0,            &__w0,&__w1);\
 		/* Need to add 2*a*b, so simply double b (which has at most 32 bits) prior to the MUL_LOHI: */\
 		MUL_LOHI64(__x.d0,__x.d1 << 1,&__a ,&__b );\
@@ -3061,7 +3061,7 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
     {\
 		uint64 __w0,__w1,__w2,__a,__b;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\
 		SQR_LOHI64(__x.d0, __w0, __w1);\
 		/* Need to add 2*a*b, so simply double b (which has at most 32 bits) prior to the MUL_LOHI: */\
 		MUL_LOHI64_ADD(__x.d0, __x.d1 << 1, __w1, __a , __b );\
@@ -3073,7 +3073,7 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
     {\
 		uint64 __w0,__w1,__w2,__a,__b;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\
 		SQR_LOHI64(__x.d0,           __w0, __w1);\
 		/* Need to add 2*a*b, so simply double b (which has at most 32 bits) prior to the MUL_LOHI: */\
 		MUL_LOHI64(__x.d0, __x.d1 << 1, __a , __b );\
@@ -3099,10 +3099,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 		uint64	__a0,__a1,__a2,__a3,\
 				__b0,__b1,__b2,__b3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\
 		\
 		SQR_LOHI    (__x0.d0,                             __a0 ,     __b0 );\
 		SQR_LOHI    (__x1.d0,                             __a1 ,     __b1 );\
@@ -3155,10 +3155,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 				__b0,__b1,__b2,__b3,\
 				__s0,__s1,__s2,__s3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -3215,10 +3215,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 				__b0,__b1,__b2,__b3,\
 				__s0,__s1,__s2,__s3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,              __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,              __a1 ,     __b1 );\
@@ -3269,10 +3269,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 				__b0,__b1,__b2,__b3,\
 				__s0,__s1,__s2,__s3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x0.d1 >> 31) == 0");	/* requires d1 < 2^31 */\
+		DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x1.d1 >> 31) == 0");\
+		DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x2.d1 >> 31) == 0");\
+		DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x3.d1 >> 31) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -3321,14 +3321,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 		uint64	__a0,__a1,__a2,__a3,__a4,__a5,__a6,__a7,\
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
 		\
 		SQR_LOHI    (__x0.d0,                             __a0 ,     __b0 );\
 		SQR_LOHI    (__x1.d0,                             __a1 ,     __b1 );\
@@ -3413,14 +3413,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\
 				__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -3509,14 +3509,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\
 				__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
+		DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
+		DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
+		DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
+		DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
+		DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
+		DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
+		DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,              __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,              __a1 ,     __b1 );\
@@ -3599,14 +3599,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops.
 				__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\
 				__s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\
+		DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 31) == 0");	/* requires d1 < 2^31; NOTE(review): confirm the "96_q8" name used in these messages */\
+		DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 31) == 0");\
+		DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 31) == 0");\
+		DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 31) == 0");\
+		DBG_ASSERT((__x4.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 31) == 0");\
+		DBG_ASSERT((__x5.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 31) == 0");\
+		DBG_ASSERT((__x6.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 31) == 0");\
+		DBG_ASSERT((__x7.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 31) == 0");\
 		\
 		SQR_LOHI64(__x0.d0,                    __a0 ,     __b0 );\
 		SQR_LOHI64(__x1.d0,                    __a1 ,     __b1 );\
@@ -3671,7 +3671,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on
     {\
 		uint64 __w1,__w2,__w3,__a,__b,__c,__d,__cy;\
 		\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\
 		\
 		MULH64(    __x.d0,__y.d0,       __w1);\
 		MUL64x32(  __x.d0,__y.d1,&__a ,&__b );\
@@ -3680,7 +3680,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on
 		/* First add [a,b] + [c,d] : since b and d <= 2^64 - 2, can add carryout of a+c sans ripple-carry check: */\
 		__a  += __c;\
 		__b  += __d + (__a < __c);\
-		DBG_ASSERT(HERE, (__b >= __d),"MULH128x96: unexpected carryout of __b");\
+		DBG_ASSERT((__b >= __d),"MULH128x96: unexpected carryout of __b");\
 		/* Now add [w1,w2,w3] + [a,b,0]: */\
 		__w1 += __a;\
 		__cy  = (__w1 < __a);\
@@ -3731,7 +3731,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on
     {\
 		uint64 __w1,__w2,__w3,__a,__b,__c,__d,__cy;\
 		\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\
 		\
 		MULH64(    __x.d0,__y.d0,       __w1);\
 		MUL64x32(  __x.d0,__y.d1, __a , __b );\
@@ -3740,7 +3740,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on
 		/* First add [a,b] + [c,d] : since b and d <= 2^64 - 2, can add carryout of a+c sans ripple-carry check: */\
 		__a  += __c;\
 		__b  += __d + (__a < __c);\
-		DBG_ASSERT(HERE, (__b >= __d),"MULH128x96: unexpected carryout of __b");\
+		DBG_ASSERT((__b >= __d),"MULH128x96: unexpected carryout of __b");\
 		/* Now add [w1,w2,w3] + [a,b,0]: */\
 		__w1 += __a;\
 		__cy  = (__w1 < __a);\
@@ -3763,10 +3763,10 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on
 		uint64 __t2,__a2,__b2,__c2,__d2,__cy2;\
 		uint64 __t3,__a3,__b3,__c3,__d3,__cy3;\
 		\
-		DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH128x96_q4: (__y0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH128x96_q4: (__y1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH128x96_q4: (__y2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH128x96_q4: (__y3.d1 >> 32) == 0");\
+		DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH128x96_q4: (__y0.d1 >> 32) == 0");\
+		DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH128x96_q4: (__y1.d1 >> 32) == 0");\
+		DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH128x96_q4: (__y2.d1 >> 32) == 0");\
+		DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH128x96_q4: (__y3.d1 >> 32) == 0");\
 		\
 		MULH64(    __x0.d0,__y0.d0, __t0);\
 		MULH64(    __x1.d0,__y1.d0, __t1);\
@@ -3851,14 +3851,14 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on
 		uint64 __t6,__a6,__b6,__c6,__d6,__cy6;\
 		uint64 __t7,__a7,__b7,__c7,__d7,__cy7;\
 		\
-		DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH128x96_q8: (__y0.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH128x96_q8: (__y1.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH128x96_q8: (__y2.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH128x96_q8: (__y3.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y4.d1 >> 32) == 0,"MULH128x96_q8: (__y4.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y5.d1 >> 32) == 0,"MULH128x96_q8: (__y5.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y6.d1 >> 32) == 0,"MULH128x96_q8: (__y6.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y7.d1 >> 32) == 0,"MULH128x96_q8: (__y7.d1 >> 32) == 0");\
+		DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH128x96_q8: (__y0.d1 >> 32) == 0");\
+		DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH128x96_q8: (__y1.d1 >> 32) == 0");\
+		DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH128x96_q8: (__y2.d1 >> 32) == 0");\
+		DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH128x96_q8: (__y3.d1 >> 32) == 0");\
+		DBG_ASSERT((__y4.d1 >> 32) == 0,"MULH128x96_q8: (__y4.d1 >> 32) == 0");\
+		DBG_ASSERT((__y5.d1 >> 32) == 0,"MULH128x96_q8: (__y5.d1 >> 32) == 0");\
+		DBG_ASSERT((__y6.d1 >> 32) == 0,"MULH128x96_q8: (__y6.d1 >> 32) == 0");\
+		DBG_ASSERT((__y7.d1 >> 32) == 0,"MULH128x96_q8: (__y7.d1 >> 32) == 0");\
 		\
 		MULH64(    __x0.d0,__y0.d0, __t0);\
 		MULH64(    __x1.d0,__y1.d0, __t1);\
@@ -4359,7 +4359,7 @@ On Alpha, this needs a total of 7 MUL instructions and 12 ALU ops.
 	/* First add [a,b] + [c,d] : since b and d <= 2^64 - 2, can add carryout of a+c sans ripple-carry check: */\
 	__a  += __c;\
 	__b  += __d + (__a < __c);\
-	DBG_ASSERT(HERE, (__b >= __d),"MULH128: unexpected carryout of __b");\
+	DBG_ASSERT((__b >= __d),"MULH128: unexpected carryout of __b");\
 	/* Now add [w1,w2,w3] + [a,b,0]: */\
 	__w1 += __a;\
 	__cy  = (__w1 < __a);\
@@ -5327,8 +5327,8 @@ Similarly, (b.y+x.d)>>32 must be added to the MULH128 result.
     {\
 		uint64 __a,__b,__c,__d,__lo64;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\
 		\
 		__a  = (__x.d0) & (uint64)0x00000000ffffffff;\
 		__b  = (__y.d0) & (uint64)0x00000000ffffffff;\
@@ -5349,8 +5349,8 @@ Similarly, (b.y+x.d)>>32 must be added to the MULH128 result.
     {\
 		uint64 __a,__b,__c,__d,__lo64;\
 		\
-		DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\
+		DBG_ASSERT((__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\
+		DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\
 		\
 		__a  = (__x.d0) & (uint64)0x00000000ffffffff;\
 		__b  = (__y.d0) & (uint64)0x00000000ffffffff;\
@@ -5395,7 +5395,7 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s.
     {\
 		uint64 __w0,__w1,__w2,__w3,__w4,__a,__b,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\
+		DBG_ASSERT((__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\
 		/* First calculate high partial products and put into w3 and w4: */\
 		__t  = __x.d2;\
 		__w4 = __t * __t;						/*   x2^2 */\
@@ -5462,7 +5462,7 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s.
     {\
 		uint64 __w0,__w1,__w2,__w3,__w4,__a,__b,__t;\
 		\
-		DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\
+		DBG_ASSERT((__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\
 		/* First calculate high partial products and put into w3 and w4: */\
 		__t  = __x.d2;\
 		__w4 = __t * __t;						/*   x2^2 */\
@@ -5504,10 +5504,10 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s.
 			__wd0,__wd1,__wd2,__wd3,\
 			__we0,__we1,__we2,__we3;\
 		\
-		DBG_ASSERT(HERE, (__x0.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x0.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x1.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x2.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x3.d2 >> 32) == 0");\
+		DBG_ASSERT((__x0.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x0.d2 >> 32) == 0");\
+		DBG_ASSERT((__x1.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x1.d2 >> 32) == 0");\
+		DBG_ASSERT((__x2.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x2.d2 >> 32) == 0");\
+		DBG_ASSERT((__x3.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x3.d2 >> 32) == 0");\
 		\
 		__t0  = __x0.d2;\
 		__t1  = __x1.d2;\
@@ -5629,14 +5629,14 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s.
 			__wd0,__wd1,__wd2,__wd3,__wd4,__wd5,__wd6,__wd7,\
 			__we0,__we1,__we2,__we3,__we4,__we5,__we6,__we7;\
 		\
-		DBG_ASSERT(HERE, (__x0.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x0.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x1.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x1.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x2.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x2.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x3.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x3.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x4.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x4.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x5.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x5.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x6.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x6.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__x7.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x7.d2 >> 32) == 0");\
+		DBG_ASSERT((__x0.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x0.d2 >> 32) == 0");\
+		DBG_ASSERT((__x1.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x1.d2 >> 32) == 0");\
+		DBG_ASSERT((__x2.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x2.d2 >> 32) == 0");\
+		DBG_ASSERT((__x3.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x3.d2 >> 32) == 0");\
+		DBG_ASSERT((__x4.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x4.d2 >> 32) == 0");\
+		DBG_ASSERT((__x5.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x5.d2 >> 32) == 0");\
+		DBG_ASSERT((__x6.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x6.d2 >> 32) == 0");\
+		DBG_ASSERT((__x7.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x7.d2 >> 32) == 0");\
 		\
 		__t0  = __x0.d2;\
 		__t1  = __x1.d2;\
@@ -5949,8 +5949,8 @@ On 32-bit hardware, take advantage of the fact that x2 and y2 are only 32 bits w
     {\
 		uint64 __w1,__w2,__w3,__w4,__a,__b,__c,__d,__e,__f,__g,__h,__i,__j,__k,__l;\
 		\
-		DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\
+		DBG_ASSERT((__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\
+		DBG_ASSERT((__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\
 		\
 		__w4 = __x.d2*__y.d2;				/*   x2*y2 */\
 		MULH64(__x.d0,__y.d0,       __w1);	/*   x0*y0.hi */\
@@ -6033,8 +6033,8 @@ On 32-bit hardware, take advantage of the fact that x2 and y2 are only 32 bits w
     {\
 		uint64 __w1,__w2,__w3,__w4,__a,__b,__c,__d,__e,__f,__g,__h,__i,__j,__k,__l;\
 		\
-		DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\
-		DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\
+		DBG_ASSERT((__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\
+		DBG_ASSERT((__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\
 		\
 		__w4 = __x.d2*__y.d2;				/*   x2*y2 */\
 		MULH64(__x.d0,__y.d0,       __w1);	/*   x0*y0.hi */\
diff --git a/src/masterdefs.h b/src/masterdefs.h
index e74b70a2..cfb95084 100755
--- a/src/masterdefs.h
+++ b/src/masterdefs.h
@@ -56,7 +56,7 @@ in util.c), otherwise alias the entire 4-argument DBG_ASSERT invocation to "Boli
 	#define DBG_WARN   WARN
 	#define DBG_INFO   INFO
 #else	/* Bolivian - lump both the FILE and LINE args together as a single __here, that's why it looks like these take 1 less arg than the underlying functions: */
-	#define DBG_ASSERT(__here, __arg2, __arg3)	/* */
+	#define DBG_ASSERT(__arg1, __arg2)	/* expands to nothing, so both arguments are discarded in this branch */
 	#define DBG_WARN(__here, __arg2, __arg3, __arg4)	/* */
 	#define DBG_INFO(__here, __arg2, __arg3, __arg4)	/* */
 #endif
diff --git a/src/mers_mod_square.c b/src/mers_mod_square.c
index 6d35333b..acc20a51 100644
--- a/src/mers_mod_square.c
+++ b/src/mers_mod_square.c
@@ -221,7 +221,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 	// v20: got rid of 1st constraint, so we can use a single mode_flag value in p-1 stage 2 for both vecs we want to fwd-FFT-only
 	//      but input in fwd-FFT-pass-1-already-done mode and ones where we do both FFTs, input in said form and left so on return:
 	//	if(fwd_fft == 1ull)
-	//		ASSERT(HERE, mode_flag < 2, "Only low bit of mode_flag field may be used in this case!");
+	//		ASSERT(mode_flag < 2, "Only low bit of mode_flag field may be used in this case!");
 	}
 
 	/* These came about as a result of multithreading, but now are needed whether built unthreaded or multithreaded */
@@ -247,7 +247,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 
 	radix0 = RADIX_VEC[0];
 	nchunks = radix0>>1;
-	ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER, "mers_mod_square: Incorrect TRANSFORM_TYPE!");
+	ASSERT(TRANSFORM_TYPE == REAL_WRAPPER, "mers_mod_square: Incorrect TRANSFORM_TYPE!");
 
 /*...initialize things upon first entry */
 
@@ -272,7 +272,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 	{
 		if(!arr_scratch) {
 			sprintf(cbuf, "Init portion of %s requires non-null scratch array!",func);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		}
 		first_entry=FALSE;
 		psave = p;
@@ -284,7 +284,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(RADIX_VEC[i] == 0)
 			{
 				sprintf(cbuf, "%s: RADIX_VEC[i = %d] zero, for i < [NRADICES = %d]!",func,i,NRADICES);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			radix_set_save[i] = RADIX_VEC[i];
 		}
@@ -293,7 +293,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(RADIX_VEC[i] != 0)
 			{
 				sprintf(cbuf, "%s: RADIX_VEC[i = %d] nonzero, for i >= [NRADICES = %d]!",func,i,NRADICES);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			radix_set_save[i] = 0;
 		}
@@ -304,12 +304,12 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		/* My array padding scheme requires N/radix0 to be a power of 2, and to be >= 2^DAT_BITS, where the latter
 		parameter is set in the Mdata.h file: */
 		if(n%radix0 != 0) {
-			sprintf(cbuf  ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf  ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf);
 		}
 		/* Make sure n/radix0 is a power of 2: */
 		i = n/radix0;
 		if((i >> trailz32(i)) != 1) {
-			sprintf(cbuf  ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf  ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf);
 		}
 
 		if(DAT_BITS < 31)
@@ -317,7 +317,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			/* Now make sure n/radix0 is sufficiently large (unless n < 2^DAT_BITS, in which case it doesn't matter): */
 			if(i < (1 << DAT_BITS))
 			{
-			//	sprintf(cbuf  ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS));	fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			//	sprintf(cbuf  ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS));	fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				// Mar 2018: Switch to 'soft' assertion error here, e.g. for timing tests at small FFT lengths:
 				sprintf(cbuf  ,"n/radix0 must be >= %u! Skipping this radix combo.\n", (1 << DAT_BITS));	WARN(HERE, cbuf, "", 1); return(ERR_ASSERT);
 			}
@@ -327,7 +327,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			{
 				sprintf(cbuf  ,"ERROR: final FFT radix may not exceed = %u!\n", (1 << (DAT_BITS-1)));
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 		}
 
@@ -360,7 +360,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		{
 			sprintf(cbuf  ,"ERROR: product of radices not equal to complex vector length\n");
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 /*		index = (int *)calloc(k,sizeof(int));	*/
@@ -369,7 +369,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		{
 			sprintf(cbuf  ,"ERROR: unable to allocate array INDEX in %s.\n",func);
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 		index = ALIGN_INT(index_ptmp);
 
@@ -576,7 +576,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		default :
 			sprintf(cbuf  ,"ERROR: radix %d not available. Halting...\n",radix0);
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		for(i = 1; i < NRADICES; i++)
@@ -615,7 +615,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			default :
 				sprintf(cbuf  ,"ERROR: intermediate radix %d not available. Halting...\n",RADIX_VEC[i]);
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 
 			/* Final radix must be 16 or 32: */
@@ -623,7 +623,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			{
 				sprintf(cbuf  ,"ERROR: final radix %d not available. Halting...\n",RADIX_VEC[i]);
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 		}
 		nradices_prim = l;	for( ; l < 30; l++) { radix_prim[l] = 0; }	// Zero any higher elements which may have been previously set due
@@ -654,7 +654,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		{
 			sprintf(cbuf  ,"ERROR: NWT does not divide N!\n");
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		/*...The roots arrays need only be half the dimension of the weights arrays (since we need n/2 complex roots
@@ -665,10 +665,10 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		tmp = (double *)calloc(n/nwt+1       ,sizeof(double));
 		si  = (   int *)calloc(nwt+1         ,sizeof(   int));
 		*/
-		wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt+1         );	if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp);
-		wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, n/nwt+radix0  );	if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp);
-		tmp_ptmp = ALLOC_DOUBLE(tmp_ptmp, n/nwt+1       );	if(!tmp_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array TMP in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }; tmp = ALIGN_DOUBLE(tmp_ptmp);
-		si_ptmp  = ALLOC_INT   ( si_ptmp, nwt+1         );	if(!si_ptmp ){ sprintf(cbuf,"ERROR: unable to allocate array SI  in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }; si  = ALIGN_INT   (si_ptmp );
+		wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt+1         );	if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp);
+		wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, n/nwt+radix0  );	if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp);
+		tmp_ptmp = ALLOC_DOUBLE(tmp_ptmp, n/nwt+1       );	if(!tmp_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array TMP in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }; tmp = ALIGN_DOUBLE(tmp_ptmp);
+		si_ptmp  = ALLOC_INT   ( si_ptmp, nwt+1         );	if(!si_ptmp ){ sprintf(cbuf,"ERROR: unable to allocate array SI  in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }; si  = ALIGN_INT   (si_ptmp );
 
 		/******************************************************************/
 		/* Crandall/Fagin weighting factors and number of bits per digit. */
@@ -702,7 +702,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QWT1= %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		simodn=0;
@@ -722,7 +722,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QWT = %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 
 			wt0  [i] = t1;	/* Ith DWT weight factor = 2^[(s*i mod N)/N], where the exponent is done using floating divide.	*/
@@ -765,7 +765,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QWT2= %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		j=0;	/* Store I*K mod NN here. We don't directly calculate I*K, since that can overflow a 32-bit integer at large runlengths.	*/
@@ -785,7 +785,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QWT = %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			tmp[i] = t1;
 		/*fprintf(stderr,"I = %d; TMP = %20.10f\n",i,tmp[i]);	*/
@@ -837,7 +837,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		(i.e. will be accessed using the lower lg(NRT) bits of the integer sincos index):
 		*/
 		rt0_ptmp = ALLOC_COMPLEX(rt0_ptmp, NRT);
-		if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		rt0 = ALIGN_COMPLEX(rt0_ptmp);
 
 		qt     = i64_to_q((int64)N2);
@@ -861,7 +861,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QCOS1= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		t1 = qfdbl(qi);
@@ -878,7 +878,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QSIN1= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		qt = QZRO;
@@ -900,7 +900,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt0[i].re = t1;
 
@@ -919,7 +919,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: I = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt0[i].im = t1;
 
@@ -936,7 +936,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		(and will be accessed using the upper bits, <lg(NRT):31>, of the integer sincos index):
 		*/
 		rt1_ptmp = ALLOC_COMPLEX(rt1_ptmp, n/(2*NRT));
-		if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		rt1 = ALIGN_COMPLEX(rt1_ptmp);
 
 		qn     = i64_to_q((int64)NRT);
@@ -962,7 +962,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QCOS2= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		t1 = qfdbl(qi);
@@ -979,7 +979,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			if(idiff > max_idiff)
 				max_idiff = idiff;
 			sprintf(cbuf,"INFO: QSIN2= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff);
-			fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 
 		qt = QZRO;
@@ -1002,7 +1002,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt1[i].re = t1;
 
@@ -1021,7 +1021,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 				if(idiff > max_idiff)
 					max_idiff = idiff;
 				sprintf(cbuf,"INFO: J = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff);
-				fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			rt1[i].im = t1;
 //if((i & 63) ==0)printf("rt1[%3u] = %20.15f, %20.15f\n",i,rt1[i].re,rt1[i].im);
@@ -1130,7 +1130,7 @@ for(i=0; i < NRT; i++) {
 
 		/* 8/23/2004: Need to allocate an extra element here to account for the padding element that gets inserted when radix0 is odd: */
 		block_index = (int *)calloc((radix0+1),sizeof(int));
-		if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		/*
 		Examples:
 
@@ -1214,7 +1214,7 @@ for(i=0; i < NRT; i++) {
 				*/
 				for(j = 0; j < 2; j++)
 				{
-					if(!(l >= 0 && l < radix0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+					if(!(l >= 0 && l < radix0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 					if((blocklen & 1) && j == 1)
 					{
@@ -1248,14 +1248,14 @@ for(i=0; i < NRT; i++) {
 		}		/* End of Main loop */
 
 		/* arrays storing the index values needed for the parallel-block wrapper/square scheme: */
-		if( !(ws_i            = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_j1           = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_j2           = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_j2_start     = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_k            = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_m            = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_blocklen     = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_blocklen_sum = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if( !(ws_i            = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_j1           = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_j2           = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_j2_start     = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_k            = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_m            = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_blocklen     = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_blocklen_sum = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 		for(ii = 0; ii < radix0; ii += 2)
 		{
@@ -1292,7 +1292,7 @@ for(i=0; i < NRT; i++) {
 					*/
 					default :
 					sprintf(cbuf,"ERROR: radix %d not available for wrapper_square. Halting...\n",RADIX_VEC[NRADICES-1]);
-					fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 			}
 		}
@@ -1302,7 +1302,7 @@ for(i=0; i < NRT; i++) {
 			fprintf(stderr, "%s:\n",func);
 			fprintf(stderr, " Max abs error between real*8 and real*16 computed values = %20.15f\n",         max_adiff);
 			fprintf(stderr, " Max bit error between real*8 and real*16 computed values = %20.0f \n", (double)max_idiff);
-			ASSERT(HERE, (max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting.");
+			ASSERT((max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting.");
 		}
 
 	#ifdef MULTITHREAD
@@ -1346,13 +1346,13 @@ for(i=0; i < NRT; i++) {
 	// Threadpool-based dispatch:
 
 		// MAX_THREADS is the max. no. of threads we expect to be able to make use of, at 1 thread per core.
-		ASSERT(HERE, MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!");
+		ASSERT(MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!");
 
 		if(nchunks % NTHREADS != 0) fprintf(stderr,"%s: radix0/2 not exactly divisible by NTHREADS - This will hurt performance.\n",func);
 
 		main_work_units = 0;
 		pool_work_units = nchunks;
-		ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+		ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 		printf("%s: Init threadpool of %d threads\n",func,NTHREADS);
 
 	#endif	// MULTITHREAD?
@@ -1671,11 +1671,11 @@ for(i=0; i < NRT; i++) {
 	// v19: Add support for mod_mul with one input being in precomputed fwd-FFTed form:
 #ifdef MULTITHREAD
 	for(i = 0; i < nchunks; ++i) { tdat[i].arrdat = a; tdat[i].fwd_fft = fwd_fft; tdat[i].c = c; }
-//	printf("Thread 0: arrdat = 0x%llX, fwd_fft = 0x%llX\n",tdat[0].arrdat,tdat[0].fwd_fft);
+//	printf("Thread 0: arrdat = %#" PRIX64 ", fwd_fft = %#" PRIX64 "\n",tdat[0].arrdat,tdat[0].fwd_fft);
 #endif
 
 	/*...Init clock counter:	*/
-	ASSERT(HERE, tdiff != 0,"mers_mod_square.c: NULL tdiff ptr!");
+	ASSERT(tdiff != 0,"mers_mod_square.c: NULL tdiff ptr!");
 
 #ifdef CTIME
 	clock1 = clock();
@@ -1693,12 +1693,12 @@ for(i=0; i < NRT; i++) {
 	*/
 	// Sep 2019: Add support for fwd_fft_only|mode_flag as described in top-of-function comments
 	if(fwd_fft == 2ull) {
-	//	fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = 0x%llX: jumping directly to undo_initial_ffft_pass.\n",ilo,ihi,(uint64)a);
+	//	fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = %#" PRIX64 ": jumping directly to undo_initial_ffft_pass.\n",ilo,ihi,(uint64)a);
 		goto undo_initial_ffft_pass;
 	}
 	if((mode_flag & 1) == 0)
 	{
-//	if(ihi<1000 && !fwd_fft)fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = 0x%llX, Fwd-WT: mode_flag = 0x%X, a[1] = %18.10f\n",ilo,ihi,(uint64)a,mode_flag,a[1]);
+//	if(ihi<1000 && !fwd_fft)fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = %#" PRIX64 ", Fwd-WT: mode_flag = %#X, a[1] = %18.10f\n",ilo,ihi,(uint64)a,mode_flag,a[1]);
 		// Mar 2017: Can skip this step if it's the start of a production test (note that any initial-residue shift
 		// in such cases is handled via single-array-word forward-DWT-weighting in the Mlucas.c shift_word() function),
 		// but need it if add RNG-input-setting above for debug, hence also check a[1] for nonzero:
@@ -1729,10 +1729,10 @@ for(i=0; i < NRT; i++) {
 				simodn += sw;	if(simodn >= n) simodn -= n;
 				bimodn += bw;	if(bimodn >= n) bimodn -= n;
 			//	if(simodn != n - bimodn) printf("I = %d: simodn[%u] != n - bimodn[%u]\n",i,simodn,n - bimodn);
-			//	ASSERT(HERE, simodn == n - bimodn, "simodn != n - bimodn");	<*** cannot require this because (for i = n-1) have simodn = 0, bimodn = n,
-				ASSERT(HERE, DNINT(a[j]) == a[j],"mers_mod_square.c: Input a[j] noninteger!");
+			//	ASSERT(simodn == n - bimodn, "simodn != n - bimodn");	<*** cannot require this because (for i = n-1) have simodn = 0, bimodn = n,
+				ASSERT(DNINT(a[j]) == a[j],"mers_mod_square.c: Input a[j] noninteger!");
 				fracmax = fabs( wt*wtinv*radix0 - 1.0 );
-				ASSERT(HERE, fracmax < 1e-10, "wt*wtinv check failed!");
+				ASSERT(fracmax < 1e-10, "wt*wtinv check failed!");
 				a[j] *= wt;
 				ii =((uint32)(sw - bimodn) >> 31);
 			}
@@ -1756,7 +1756,7 @@ for(i=0; i < NRT; i++) {
 	*/
 	ierr = 0;	/* Any return-value error code (whether fatal or not) stored here */
 
-	ASSERT(HERE, ihi > ilo,"mers_mod_square.c: ihi <= ilo!");
+	ASSERT(ihi > ilo,"mers_mod_square.c: ihi <= ilo!");
 
   #if DBG_THREADS
 	fprintf(stderr,"%s: NTHREADS = %3d\n",func,NTHREADS);
@@ -1798,7 +1798,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 	while(tpool->free_tasks_queue.num_tasks != pool_work_units) {
 	//		sleep(1);	//*** too granular ***
 		// Finer-resolution, declared in <time.h>; cf. http://linux.die.net/man/2/nanosleep
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!");
 	//	printf("sleep; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 	}
 //	printf("end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
@@ -1814,7 +1814,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 #endif
 
 	if(fwd_fft == 1) {
-	//	fprintf(stderr,"[ilo,ihi] = [%u,%u]: fwd_fft = %llu, mode_flag = %u: exiting after fwd-FFT.\n",ilo,ihi,fwd_fft,mode_flag);
+	//	fprintf(stderr,"[ilo,ihi] = [%u,%u]: fwd_fft = %" PRIu64 ", mode_flag = %u: exiting after fwd-FFT.\n",ilo,ihi,fwd_fft,mode_flag);
 		return 0;	// Skip carry step [and preceding inverse-FFT] in this case
 	}
 	// Update RES_SHIFT via mod-doubling, *** BUT ONLY IF IT'S AN AUTOSQUARE ***:
@@ -1931,7 +1931,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 			ierr = radix4096_ditN_cy_dif1    (a,n,nwt,nwt_bits,wt0,wt1,si,0x0,0x0,base,baseinv,iter,&fracmax,p); break;
 	*/
 		default :
-			sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 	}
 
 	// v19: Nonzero exit carries used to be fatal, added retry-from-last-savefile handling for these
@@ -2025,15 +2025,15 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++)
 // On early-exit-due-to-interrupt, decrement iter since we didn't actually do the (iter)th iteration
 if(!MLUCAS_KEEP_RUNNING) {
 	iter--;
-//	fprintf(stderr,"%s: fwd_fft_only = 0x%016X, fwd_fft = %X; Caught interrupt at iter = %u; --iter = %u\n",func,fwd_fft_only,fwd_fft,iter+1,iter);
+//	fprintf(stderr,"%s: fwd_fft_only = %#016X, fwd_fft = %X; Caught interrupt at iter = %u; --iter = %u\n",func,fwd_fft_only,fwd_fft,iter+1,iter);
 }
 if(iter < ihi) {
-	ASSERT(HERE, !MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!");
+	ASSERT(!MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!");
 	ierr = ERR_INTERRUPT;
 	ROE_ITER = iter;	// Function return value used for error code, so save number of last-iteration-completed-before-interrupt here
-//	fprintf(stderr,"Caught signal at iter = %u; mode_flag = 0x%X\n",iter,mode_flag);
+//	fprintf(stderr,"Caught signal at iter = %u; mode_flag = %#X\n",iter,mode_flag);
 	mode_flag &= 0xfffffffd;	// v20: In case of interrupt-exit override any mode_flag "skip undo of initial DIF pass" setting
-//	fprintf(stderr,"After ^2-toggle, mode_flag = 0x%X, (mode_flag >> 1) = 0x%X\n",mode_flag,mode_flag>>1);
+//	fprintf(stderr,"After ^2-toggle, mode_flag = %#X, (mode_flag >> 1) = %#X\n",mode_flag,mode_flag>>1);
 }
 
 #ifdef RTIME
@@ -2060,7 +2060,7 @@ if(iter < ihi) {
 
 	if((mode_flag >> 1) == 0)
 	{
-	//	fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = 0x%llX, Inv-WT: mode_flag = 0x%X\n",ilo,ihi,(uint64)a,mode_flag);
+	//	fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = %#" PRIX64 ", Inv-WT: mode_flag = %#X\n",ilo,ihi,(uint64)a,mode_flag);
 		func_dit1(a,n);
 
 	/*...and unweight the data array.	*/
@@ -2127,7 +2127,7 @@ if(iter < ihi) {
 	// [action] Prior to returning, print a "retry successful" informational and rezero ROE_ITER and ROE_VAL.
 	// *** v19: For PRP-test Must make sure we are at end of checkpoint-file iteration interval, not one of the Gerbicz-update subintervals ***
 	if(!INTERACT && ROE_ITER > 0 && ihi%ITERS_BETWEEN_CHECKPOINTS == 0) {	// In interactive (timing-test) mode, use ROE_ITER to accumulate #iters-with-dangerous-ROEs
-		ASSERT(HERE, (ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!");
+		ASSERT((ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!");	// NOTE(review): (iter = ihi+1) is an assignment, not a comparison - this clause is true whenever ihi+1 != 0 and overwrites iter; confirm whether == was intended
 		ROE_ITER = 0;
 		ROE_VAL = 0.0;
 		sprintf(cbuf,"Retry of iteration interval with fatal roundoff error was successful.\n");
@@ -2196,7 +2196,7 @@ void mers_process_chunk(
 	dyadic-multiply FFT(a) * FFT(b) and iFFT the product, storing the result in a[].
 	*/
   if((fwd_fft & 0xC) != 0) {
-	ASSERT(HERE, ((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *");
+	ASSERT(((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *");
   }	else {
 	for(j = 0; j < jhi; j++)
 	{
@@ -2226,7 +2226,7 @@ void mers_process_chunk(
 			case 32 :
 				radix32_dif_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break;
 			default :
-				sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 
 			k    += mm*radix0;
@@ -2262,7 +2262,7 @@ void mers_process_chunk(
 			radix64_wrapper_square(a,arr_scratch,n,radix0,rt0,rt1,nradices_prim,radix_prim, ws_i[l], ws_j1[l], ws_j2[l], ws_j2_start[l], ws_k[l], ws_m[l], ws_blocklen[l], ws_blocklen_sum[l],init_sse2,thr_id, fwd_fft, c); break;
 		*/
 		default :
-			sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 		}
 	}
 
@@ -2286,7 +2286,7 @@ void mers_process_chunk(
 	{
 		/* Get block index of the chunk of contiguous data to be processed: */
 		l = block_index[ii + j];
-		ASSERT(HERE, l >= 0,"mers_mod_square.c: l >= 0");
+		ASSERT(l >= 0,"mers_mod_square.c: l >= 0");
 
 		/* Quick-n-dirty way of generating the correct starting values of k, mm and incr -
 		simply use the skeleton of the forward (DIF) loop, sans the i = NRADICES-2 pass
@@ -2330,7 +2330,7 @@ void mers_process_chunk(
 			case 32 :
 				radix32_dit_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break;
 			default :
-				sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+				sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 		}	/* end i-loop */
 
diff --git a/src/mi64.c b/src/mi64.c
index 08bb5c88..1a031a4f 100755
--- a/src/mi64.c
+++ b/src/mi64.c
@@ -78,7 +78,7 @@ __device__ uint32 mi64_twopmodq_gpu(
 	hi    = gpu_thread_local + lenQ*6;
 	scratch  = hi + lenQ;
 	scratch2 = hi + lenQ*2;
-	cyout = mi64_mul_scalar(p,k<<1,q,lenQ);	ASSERT(HERE, 0 == cyout, "unexpected carryout of 2*p*k!");
+	cyout = mi64_mul_scalar(p,k<<1,q,lenQ);	ASSERT(0 == cyout, "unexpected carryout of 2*p*k!");
 	q[0] += 1;	// q = 2.k.p + 1; No need to check for carry since 2.k.p even
 	mi64_shrl_short(q, qhalf, 1, lenQ);	/* (q >> 1) = (q-1)/2, since q odd. */
 
@@ -163,7 +163,7 @@ __device__ uint32 mi64_twopmodq_gpu(
 	return mi64_cmp_eq_scalar(x, 1ull, lenQ);
 
 #else	// ifndef __CUDA_ARCH__
-	ASSERT(HERE, 0, "Device code being called in host mode!");
+	ASSERT(0, "Device code being called in host mode!");
 	return 0;
 #endif
 }
@@ -208,7 +208,7 @@ void	mi64_brev(uint64 x[], uint32 n)
 		x[wi] &= ~mi;			x[wj] &= ~mj;			// Mask off the bits to be swapped
 		x[wi] ^= bj<<i;			x[wj] ^= bi<<j;			// XOR each the just-zeroed bits with the bit-to-be-swapped-in
 	}
-printf("0x%2X,",(uint8)x[0]);
+printf("0x%2X,",(uint8)x[0]);	// Keep the literal "0x" prefix: "%#2X" would emit "0X", omit the prefix for a zero value, and count the prefix toward the field width
 }
 /*
 Bytewise version:
@@ -267,9 +267,9 @@ __device__
 void	mi64_set_eq(uint64 x[], const uint64 y[], uint32 len)
 {
 	uint32 i;
-	ASSERT(HERE, len != 0, "zero-length array!");
+	ASSERT(len != 0, "zero-length array!");
 	if(x == y) return;
-//	ASSERT(HERE, !ARRAYS_OVERLAP(x,len, y,len), "Input arrays overlap!");	* Fairly expensive to check, so disable by default *
+//	ASSERT(!ARRAYS_OVERLAP(x,len, y,len), "Input arrays overlap!");	* Fairly expensive to check, so disable by default *
 	for(i = 0; i < len; ++i) {
 		x[i] = y[i];
 	}
@@ -282,7 +282,7 @@ __device__
 void	mi64_set_eq_scalar(uint64 x[], const uint64 a, uint32 len)
 {
 	uint32 i;
-	ASSERT(HERE, len != 0, "zero-length array!");
+	ASSERT(len != 0, "zero-length array!");
 	x[0] = a;
 	for(i = 1; i < len; ++i) {
 		x[i] = 0ull;
@@ -318,7 +318,7 @@ uint64	mi64_shl(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 	int i;
 	uint32 nwshift = (nshift >> 6), rembits = (nshift & 63), m64bits;
 	uint64 lo64 = 0ull;
-	ASSERT(HERE, len != 0, "mi64_shl: zero-length array!");
+	ASSERT(len != 0, "mi64_shl: zero-length array!");
 	// Special-casing for 0 shift count:
 	if(!nshift) {
 		if(x != y) mi64_set_eq(y, x, len);	// Set y = x
@@ -415,13 +415,13 @@ __device__
 #endif
 void	mi64_shlc(const uint64 x[], uint64 y[], uint32 nbits, uint32 nshift, uint32 len, uint32 sign_flip)
 {	/**** NOTE: The (nbits+63) here means the largest exponent currently testable is 4294967231 = 2^32-65, larger ones like 4294967291 = 2^32-5 overflow uint32 => nwmod = 0 ****/
-	ASSERT(HERE, nshift <= nbits && (nbits+63) <= 0xFFFFFFFFu, "mi64_shlc: Require (nshift <= nbits) and (nbits+63) < 2^32!");
+	ASSERT(nshift <= nbits && (nbits+63) <= 0xFFFFFFFFu, "mi64_shlc: Require (nshift <= nbits) and (nbits+63) < 2^32!");
 	uint32 i = nbits&63, nwshift = (nshift+63) >> 6, nwmod = ((nbits + 63)>>6);	// Here nwshift includes any partial words in addition to fullwords
-	ASSERT(HERE, x && len, "mi64_shlc: null input pointer or zero-length array!");
+	ASSERT(x && len, "mi64_shlc: null input pointer or zero-length array!");
 	// W/o the extra "& (nbits&63)" this assumes nbits != 0, i.e. unsuitable for Fermats:
 	uint64 cy, mask64 = (-1ull << i) & -(uint64)(i != 0);	// = (-1ull << i) if Mersenne, 0 if Fermat
-	ASSERT(HERE, (x[len-1] & mask64) == 0ull, "mi64_shlc: x[] has set bits beyond [nbits] position in high word!");
-//	printf("mi64_shlc: %u bits, %u limbs, mask64 = 0x%llX, high limb = 0x%llX\n",nbits,len,mask64,x[len-1]);
+	ASSERT((x[len-1] & mask64) == 0ull, "mi64_shlc: x[] has set bits beyond [nbits] position in high word!");
+//	printf("mi64_shlc: %u bits, %u limbs, mask64 = %#" PRIX64 ", high limb = %#" PRIX64 "\n",nbits,len,mask64,x[len-1]);
   #ifndef __CUDA_ARCH__
 	/* Scratch array for storing off-shifted intermediate (need this to support in-place functionality): */
 	static uint64 *u = 0x0;
@@ -431,10 +431,10 @@ void	mi64_shlc(const uint64 x[], uint64 y[], uint32 nbits, uint32 nshift, uint32
 	if(dimU < 2*(nwmod+1)) {                // GG: fixed bug in comparison
 		dimU = 2*(nwmod+1);
 		// Alloc 2x the immediately-needed to avoid excessive reallocs if needed size increases incrementally
-		u = (uint64 *)realloc(u, dimU*sizeof(uint64));	ASSERT(HERE, u != 0x0, "alloc failed!");
+		u = (uint64 *)realloc(u, dimU*sizeof(uint64));	ASSERT(u != 0x0, "alloc failed!");
 	}
   #endif
-	ASSERT(HERE, nshift <= nbits, "mi64_shlc: shift count must be <= than bits in modulus!");	// This also ensures (nwshift < nwmod)
+	ASSERT(nshift <= nbits, "mi64_shlc: shift count must be <= than bits in modulus!");	// This also ensures (nwshift < nwmod)
 	// Special-casing for 0 shift count, which includes the 1-full-rotation case nshift == nbits:
 	if(!nshift || (nshift == nbits)) {
 		if(x != y) mi64_set_eq(y, x, len);	// Set y = x
@@ -451,10 +451,10 @@ void	mi64_shlc(const uint64 x[], uint64 y[], uint32 nbits, uint32 nshift, uint32
 			if(nwshift < len)
 				cy = mi64_sub_scalar(y+nwshift,cy,y+nwshift,len-nwshift);
 			// In Fermat-mod case, if high bits happen to = 0, must (mod Fm) by adding borrow = 1 back into low limb:
-			ASSERT(HERE, mi64_sub_scalar(y,cy,y,len) == 0ull, "Nonzero carryout of (mod Fm) low-limb incrementing!");
+			ASSERT(mi64_sub_scalar(y,cy,y,len) == 0ull, "Nonzero carryout of (mod Fm) low-limb incrementing!");
 		}
 	} else {
-		cy = mi64_add(y, u, y, nwshift);	ASSERT(HERE, cy == 0ull, "Nonzero carryout of nonoverlapping vector add!");
+		cy = mi64_add(y, u, y, nwshift);	ASSERT(cy == 0ull, "Nonzero carryout of nonoverlapping vector add!");
 	}
 }
 
@@ -493,8 +493,8 @@ uint32	mi64_shlc_bits_align(const uint64 x[], uint64 y[], uint32 nbits)
 	uint32 len = (nbits+63)>>6, i,match = 0, curr_word,curr_bit,main_part,high_part,hi_word_bits = nbits&63;
 	// W/o the extra "& (nbits&63)" this assumes nbits != 0, i.e. unsuitable for Fermats:
 	uint64 mask64 = (-1ull << (nbits&63)) & (uint64)(nbits&63);
-	ASSERT(HERE, x && y && len, "mi64_shlc_bits_align: null input pointer or zero-length array!");
-	ASSERT(HERE, (x[len-1] & mask64) == 0ull && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_align: x or y has set bits beyond [nbits] position in high word!");
+	ASSERT(x && y && len, "mi64_shlc_bits_align: null input pointer or zero-length array!");
+	ASSERT((x[len-1] & mask64) == 0ull && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_align: x or y has set bits beyond [nbits] position in high word!");
 	// Special-casing for in-place and 0-length case:
 	if(!nbits || (x == y)) return 0;
 	// Special-casing for single-word inputs:
@@ -546,8 +546,8 @@ uint32	mi64_shlc_bits_limb0(const uint64 x0, const uint64 y[], uint32 nbits)
 	uint32 len = (nbits+63)>>6, i, curr_word,curr_bit,main_part,high_part,hi_word_bits = nbits&63;
 	// W/o the extra "& (nbits&63)" this assumes nbits != 0, i.e. unsuitable for Fermats:
 	uint64 mask64 = (-1ull << (nbits&63)) & (uint64)(nbits&63);
-	ASSERT(HERE, y && len, "mi64_shlc_bits_limb0: null input pointer or zero-length array!");
-	ASSERT(HERE, (nbits > 64 || (x0 & mask64) == 0ull) && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_limb0: x or y has set bits beyond [nbits] position in high word!");
+	ASSERT(y && len, "mi64_shlc_bits_limb0: null input pointer or zero-length array!");
+	ASSERT((nbits > 64 || (x0 & mask64) == 0ull) && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_limb0: x or y has set bits beyond [nbits] position in high word!");
 	// Special-casing for 0-length case:
 	if(!nbits) return 0;
 	// Special-casing for single-word inputs:
@@ -601,7 +601,7 @@ uint64	mi64_shrl(const uint64 x[], uint64 y[], uint32 nshift, uint32 len, uint32
 	int i;
 	uint32 nwshift = (nshift >> 6), rembits = (nshift & 63), m64bits;
 	uint64 hi64 = 0ull;
-	ASSERT(HERE, len != 0, "mi64_shrl: zero-length array!");
+	ASSERT(len != 0, "mi64_shrl: zero-length array!");
 	/*
 	Ex 1: len = 1132 = 72448 bits, nshift = 70000, nwshift = 70000>>6 = 1093, rembits = 70000%64 = 48, m64bits = 64-rembits = 16
 	Thus we want the hi 2448 bits (38 full words + 16 bits) of x, and require output_len >= 39 .
@@ -613,7 +613,7 @@ uint64	mi64_shrl(const uint64 x[], uint64 y[], uint32 nshift, uint32 len, uint32
 	But user has specified output_len = 16, meaning they want at most 1024 bits of x[], so only copy that many and exit.
 	So allow output_len to be 1 limb smaller than 17 as a fudge factor to handle arbitrary in-word copy-bit boundaries:
 	*/
-	ASSERT(HERE, output_len >= (len-nwshift)-1, "mi64_shrl: output_len must be large enough to hold result!");
+	ASSERT(output_len >= (len-nwshift)-1, "mi64_shrl: output_len must be large enough to hold result!");
 	// Special-casing for 0 shift count:
 	if(!nshift) {
 		if(x != y) {
@@ -677,7 +677,7 @@ uint64	mi64_shl_short_ref(const uint64 x[], uint64 y[], uint32 nshift, uint32 le
 	int i;
 	uint32 m64bits = (64-nshift);
 	uint64 lo64 = 0ull;
-	ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!");
+	ASSERT(len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!");
 	// Special-casing for 0 shift count:
 	if(!nshift) {
 		if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; }
@@ -715,7 +715,7 @@ uint64	mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 	int i, i0 = 0, i1 = 1, use_asm = FALSE, x_misalign = 0, y_misalign = 0;
 	uint32 m64bits = (64-nshift), leftover = 0;
 	uint64 lo64 = 0ull;
-	ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!");
+	ASSERT(len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!");
 	// Special-casing for 0 shift count:
 	if(!nshift) {
 		if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; }
@@ -770,7 +770,7 @@ uint64	mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 			3. if x,y have same 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, find i0 >= 1 such that x[i0] is SIMD-aligned.
 		*/
 		if( ((uintptr_t)x & 0x7) != 0 || ((uintptr_t)y & 0x7) != 0 )
-			ASSERT(HERE, 0, "require 8-byte alignment of x,y!");
+			ASSERT(0, "require 8-byte alignment of x,y!");
 		// In SIMD-ASM case, x_misalign = (0,1,2, or 3) how many words x[0] is above next-lower alignment boundary:
 		x_misalign = ((uintptr_t)x & BASEADDRMASK)>>3;	y_misalign = ((uintptr_t)y & BASEADDRMASK)>>3;
 
@@ -808,7 +808,7 @@ uint64	mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 	*/
   #if MI64_SHL1_DBG
 	if(dbg)
-		printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = 0x%X,0x%X, base-addr for SHL macro = 0x%X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2));
+		printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = %#X,%#X, base-addr for SHL macro = %#X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2));
   #endif
 	// Full-vector (except for x[0]) processing loop if no ASM; high-words cleanup-loop if ASM:
 	for(i = len-1; i >= i1; i--) {
@@ -1102,19 +1102,19 @@ uint64	mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 	// Low-end clean-up loop (only used in ASM-loop case):
 	for(i = i0-1; i > 0; i--) {
 	#if MI64_SHL1_DBG
-		if(dbg) printf("Low-end clean-up loop: x[%u,%u] = 0x%16llX,0x%16llX; <<%u,>>%u = 0x%16llX,0x%16llX\n",i,i-1,x[i],x[i-1],nshift,m64bits,(x[i] << nshift),(x[i-1] >> m64bits));
+		if(dbg) printf("Low-end clean-up loop: x[%u,%u] = %#16" PRIX64 ",%#16" PRIX64 "; <<%u,>>%u = %#16" PRIX64 ",%#16" PRIX64 "\n",i,i-1,x[i],x[i-1],nshift,m64bits,(x[i] << nshift),(x[i-1] >> m64bits));
 	#endif
 		y[i] = (x[i] << nshift) + (x[i-1] >> m64bits);
 	#if MI64_SHL1_DBG
-		if(dbg) printf("    ==> y[%u] = 0x%16llX\n",i,y[i]);
+		if(dbg) printf("    ==> y[%u] = %#16" PRIX64 "\n",i,y[i]);
 	#endif
 	}
 	// Least-significant element gets zeros shifted in from the right:
 	y[0] = (x[0] << nshift);
   #if MI64_SHL1_DBG
 	if(len < 1000) {
-		if(lo64 != ref[len]) { printf("SHL1 Carryout mismatch: (y[%u] = %16llX) != (ref[%u] = %16llX)\n",len,lo64,len,ref[len]); ASSERT(HERE, 0, "Exiting!"); }
-		if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16llX) != (ref[%u] = %16llX)\n",i,y[i],i,ref[i]); printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, misalign = %u, use_asm = %u; x,y = 0x%X,0x%X, base-addr for SHL macro = 0x%X\n",nshift,len,i0,i1,leftover,x_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); ASSERT(HERE, 0, "Exiting!"); } } }
+		if(lo64 != ref[len]) { printf("SHL1 Carryout mismatch: (y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",len,lo64,len,ref[len]); ASSERT(0, "Exiting!"); }
+		if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",i,y[i],i,ref[i]); printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, misalign = %u, use_asm = %u; x,y = %#X,%#X, base-addr for SHL macro = %#X\n",nshift,len,i0,i1,leftover,x_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); ASSERT(0, "Exiting!"); } } }
 	}
   #endif
 	return lo64;
@@ -1134,7 +1134,7 @@ uint64	mi64_shrl_short_ref(const uint64 x[], uint64 y[], uint32 nshift, uint32 l
 	int i;
 	uint32 m64bits = (64-nshift), leftover = 0;
 	uint64 hi64 = 0ull;
-	ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!");
+	ASSERT(len != 0 && nshift < 64, "mi64_shrl_short_ref: zero-length array or shift count >= 64!");
 	// Special-casing for 0 shift count:
 	if(!nshift) {
 		if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; }
@@ -1169,7 +1169,7 @@ uint64	mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 	int i, i0 = 0, i1 = 0, use_asm = FALSE, x_misalign, y_misalign;
 	uint32 m64bits = (64-nshift), leftover = 0;
 	uint64 hi64 = 0ull;
-	ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!");
+	ASSERT(len != 0 && nshift < 64, "mi64_shrl_short: zero-length array or shift count >= 64!");
 	// Special-casing for 0 shift count:
 	if(!nshift) {
 		if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; }
@@ -1223,7 +1223,7 @@ uint64	mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 			3. if x,y have same 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, find i0 >= 0 such that x[i0] is SIMD-aligned.
 		*/
 		if( ((uintptr_t)x & 0x7) != 0 || ((uintptr_t)y & 0x7) != 0 )
-			ASSERT(HERE, 0, "require 8-byte alignment of x,y!");
+			ASSERT(0, "require 8-byte alignment of x,y!");
 		x_misalign = ((uintptr_t)x & BASEADDRMASK)>>3;	y_misalign = ((uintptr_t)y & BASEADDRMASK)>>3;
 
 		// minlen may have been incr. for alignment purposes, so use_asm not an unconditional TRUE here
@@ -1253,7 +1253,7 @@ uint64	mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 
   #if MI64_SHR1_DBG
 	if(dbg)
-		printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = 0x%X,0x%X, base-addr for SHRL macro = 0x%X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2));
+		printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = %p,%p, base-addr for SHRL macro = %p\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(const void *)x,(const void *)y,(const void *)(x+i1-2));
   #endif
 	// Low-end cleanup-loop if ASM:
 	for(i = 0; i < i0; i++) {
@@ -1607,8 +1607,8 @@ uint64	mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
 
   #if MI64_SHR1_DBG
 	if(len < 1000) {
-		if(hi64 != ref[len]) { printf("SHR1 Carryout mismatch: (y[%u] = %16llX) != (ref[%u] = %16llX)\n",len,hi64,len,ref[len]); ASSERT(HERE, 0, "Exiting!"); }
-		if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16llX) != (ref[%u] = %16llX)\n",i,y[i],i,ref[i]); ASSERT(HERE, 0, "Exiting!"); } } }
+		if(hi64 != ref[len]) { printf("SHR1 Carryout mismatch: (y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",len,hi64,len,ref[len]); ASSERT(0, "Exiting!"); }
+		if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",i,y[i],i,ref[i]); ASSERT(0, "Exiting!"); } } }
 	}
   #endif
 	return hi64;
@@ -1623,7 +1623,7 @@ uint32	mi64_cmpult(const uint64 x[], const uint64 y[], uint32 len)
 {
 	uint32 i;
 	// Need hard-assert here due to zero-element default compare:
-	ASSERT(HERE, len != 0, "mi64_cmpult: zero-length array!");
+	ASSERT(len != 0, "mi64_cmpult: zero-length array!");
 	for(i = len-1; i !=0 ; i--)	/* Loop over all but the 0 elements while equality holds.... */
 	{
 		if(x[i] < y[i]) {
@@ -1643,7 +1643,7 @@ uint32	mi64_cmp_eq(const uint64 x[], const uint64 y[], uint32 len)
 	uint32 i;
 	// Allow for zero-length here with default return TRUE,
 	// according to the convention that a zero-length mi64 object = 0:
-	ASSERT(HERE, len != 0, "mi64_cmp_eq: zero-length array!");	//  allows us to catch zero-length cases in debug build & test
+	ASSERT(len != 0, "mi64_cmp_eq: zero-length array!");	//  allows us to catch zero-length cases in debug build & test
 	for(i = 0; i < len; i++) {
 		if(x[i] != y[i])
 			return FALSE;
@@ -1656,7 +1656,7 @@ __device__
 #endif
 uint32	mi64_cmplt_scalar(const uint64 x[], uint64 a, uint32 len)
 {
-	ASSERT(HERE, len != 0, "zero-length array!");
+	ASSERT(len != 0, "zero-length array!");
 	return ( (mi64_getlen(x, len) <= 1) && (x[0] < a) );
 }
 
@@ -1665,7 +1665,7 @@ __device__
 #endif
 uint32	mi64_cmpgt_scalar(const uint64 x[], uint64 a, uint32 len)
 {
-	ASSERT(HERE, len != 0, "zero-length array!");
+	ASSERT(len != 0, "zero-length array!");
 	return ( (x[0] > a) || (mi64_getlen(x, len) > 1) );
 }
 
@@ -1674,7 +1674,7 @@ __device__
 #endif
 uint32	mi64_cmp_eq_scalar(const uint64 x[], uint64 a, uint32 len)
 {
-	ASSERT(HERE, len != 0, "mi64_cmp_eq_scalar: zero-length array!");
+	ASSERT(len != 0, "mi64_cmp_eq_scalar: zero-length array!");
 	return ( (x[0] == a) && (mi64_getlen(x+1, len-1) == 0) );
 }
 
@@ -1745,7 +1745,7 @@ int	mi64_ith_set_bit(const uint64 x[], uint32 bit, uint32 len)
 {
 	int curr_pop,i,j,retval = 0;
 	if(!len || !bit) return -1;
-	ASSERT(HERE, bit <= (len<<6), "[bit]th-bit specifier out of range!");
+	ASSERT(bit <= (len<<6), "[bit]th-bit specifier out of range!");
 	// Find the word in which the [bit]th set-bit occurs:
 	for(i = 0; i < len; i++) {
 		curr_pop = popcount64(x[i]);
@@ -1769,7 +1769,7 @@ __device__
 uint32	mi64_trailz(const uint64 x[], uint32 len)
 {
 	uint32 i, tz = 0;
-	ASSERT(HERE, len != 0, "mi64_trailz: zero-length array!");
+	ASSERT(len != 0, "mi64_trailz: zero-length array!");
 	for(i = 0; i < len; i++, tz += 64) {
 		if(x[i]) {
 			return tz + trailz64(x[i]);
@@ -1832,8 +1832,8 @@ __device__
 #endif
 void mi64_md5(uint64 x[], uint32 len, uint64 md5[], char*const md5_str)
 {
-	ASSERT(HERE, x != 0x0, "mi64_md5: null input pointer!");
-	ASSERT(HERE, md5_str != 0x0, "mi64_md5: null md5_str pointer!");
+	ASSERT(x != 0x0, "mi64_md5: null input pointer!");
+	ASSERT(md5_str != 0x0, "mi64_md5: null md5_str pointer!");
 	md5_str[0] = '\0';	// should be null on entry, but better safe than sorry
 	uint32 i,j, lz = mi64_leadz(x,len);	// lz = #leading 0-bits in x
 	uint32 n = len<<6;	// n = 64*len = #bits in the [len] words of x, including leading 0-bits
@@ -1841,7 +1841,7 @@ void mi64_md5(uint64 x[], uint32 len, uint64 md5[], char*const md5_str)
 	// Compute the working length [nword]:
 	uint32 nblock = (nbit+576)>>9;	// needed number of 512-bit data chucks: nblock = (nbit+576)/512
 	uint32 nword = nblock<<3;		// nword = 8*#blocks: From here on will use that as the working length
-	ASSERT(HERE, len >= nword, "mi64_md5: input-vector lacks sufficient 0-padding!");
+	ASSERT(len >= nword, "mi64_md5: input-vector lacks sufficient 0-padding!");
 	// Pre-processing:
 	// 1. first a single bit, 1, is appended to the end of the message:
 	mi64_set_bit(x,nbit,nword,1);	// nword here is only used by mi64_set_bit() for bounds-checking
@@ -1931,15 +1931,15 @@ uint32 mi64_extract_lead64(const uint64 x[], uint32 len, uint64*result)
 {
 	uint32 i,nshift,nwshift,rembits;
 
-	ASSERT(HERE, len != 0, "mi64_extract_lead64: zero-length array!");
+	ASSERT(len != 0, "mi64_extract_lead64: zero-length array!");
 
 	nshift = mi64_leadz(x, len);
 	nwshift = (nshift >> 6);
 	rembits = (nshift & 63);
 	/* shift-word count may == len, but only if x[] = 0: */
 	if(nwshift >= len) {
-		ASSERT(HERE, nwshift == len, "mi64_extract_lead64: nwshift out of range!");
-		ASSERT(HERE, mi64_iszero(x, len), "mi64_extract_lead64: expected zero-valued array!");
+		ASSERT(nwshift == len, "mi64_extract_lead64: nwshift out of range!");
+		ASSERT(mi64_iszero(x, len), "mi64_extract_lead64: expected zero-valued array!");
 		*result = 0ull;
 	} else {
 		i = len-1-nwshift;
@@ -1964,19 +1964,19 @@ double	mi64_cvt_double(const uint64 x[], uint32 len)
 	if(lead64 == 0ull) {
 		return 0.0;
 	}
-	ASSERT(HERE,(lead64 >> 63) == 1ull, "mi64_cvt_double: lead64 lacks leftmost ones bit!");
+	ASSERT((lead64 >> 63) == 1ull, "mi64_cvt_double: lead64 lacks leftmost ones bit!");
 	/*  round based on 1st neglected bit: */
 	lead64_rnd = (lead64 >> 11) + ((lead64 >> 10) & 0x0000000000000001ull);
 	/* exponent: */
 	itmp64 = (((uint64)0x3FD + (uint64)pow2) << 52);
 	/* Add in mantissa, with hidden bit made explicit, hence the 0x3FD (rather than 0x3FE) initializer */
 	itmp64 += lead64_rnd;
-	ASSERT(HERE, itmp64 > lead64_rnd , "mi64_cvt_double: Exponent overflows IEEE64 field");
+	ASSERT(itmp64 > lead64_rnd , "mi64_cvt_double: Exponent overflows IEEE64 field");
 	/* GCC bug: needed to add the explicit sign-check below, otherwise GCC 'optimizes' away the (*(double *)&itmp64): */
 	retval = *(double *)&itmp64;
 	if(retval < 0.0) {
-		sprintf(cbuf, "rng_isaac_rand_double_norm_pos: lead64 = %16llx, itmp64 = %16llx, retval = %lf not in [0,1]!\n", lead64, itmp64, retval);
-		ASSERT(HERE, 0, cbuf);
+		sprintf(cbuf, "mi64_cvt_double: lead64 = %16" PRIx64 ", itmp64 = %16" PRIx64 ", retval = %lf is negative!\n", lead64, itmp64, retval);
+		ASSERT(0, cbuf);
 	}
 	return retval;
 }
@@ -2002,8 +2002,8 @@ void mi64_extract_lead128(const uint64 x[], uint32 len, uint32 nshift, uint64 le
 {
 	lead_x[0] = lead_x[1] = 0;
 
-	ASSERT(HERE, len != 0, "mi64_extract_lead128: zero-length array!");
-	ASSERT(HERE, nshift < 64, "mi64_extract_lead128: illegal nshift value!");
+	ASSERT(len != 0, "mi64_extract_lead128: zero-length array!");
+	ASSERT(nshift < 64, "mi64_extract_lead128: illegal nshift value!");
 
 	/* Syntax reminder:
 		MVBITS(from_integer,low_bit_of_from_integer,num_bits,to_integer,insert_bits_in_to_integer_starting_at_this_low_bit)
@@ -2100,7 +2100,7 @@ uint64	mi64_add_ref(const uint64 x[], const uint64 y[], uint64 z[], uint32 len)
 {
 	uint32 i;
 	uint64 tmp, cy = 0;
-	ASSERT(HERE, len != 0, "mi64_add: zero-length array!");
+	ASSERT(len != 0, "mi64_add: zero-length array!");
 
 	for(i = 0; i < len; i++) {
 		tmp = x[i] + cy;
@@ -2145,7 +2145,7 @@ uint64	mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len,
 		// SdyBr: 6.60
 		uint32 i;
 		uint64 tmp, cy = 0;
-		ASSERT(HERE, len != 0, "mi64_add: zero-length array!");
+		ASSERT(len != 0, "mi64_add: zero-length array!");
 
 		for(i = 0; i < len; i++) {
 			tmp = x[i] + cy;
@@ -2250,7 +2250,7 @@ uint64	mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len,
 		//
 		uint32 i, odd = (len&1), len2 = len >> 1;
 		uint64 tmp, cy = 0, c2 = 0;
-	ASSERT(HERE, has_sse42() != 0, "This ASM requires SSE4.2, which is unavailable on this CPU!");
+	ASSERT(has_sse42() != 0, "This ASM requires SSE4.2, which is unavailable on this CPU!");
 		if(len2) {
 		/* x86_64 ASM implementation of the add/carry loop: */
 		__asm__ volatile (\
@@ -2328,7 +2328,7 @@ uint64	mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len,
 		// Jun 2016: bizarre ... GCC builds with opt > 0 on Haswell/Broadwell init this != 0 ...
 		//  making static not a reliable workaround, so try put cy = 0 init on separate line from declaration:
 		uint64 cy;
-		cy = 0ull;	ASSERT(HERE, cy == 0, "Init (cy = 0) fails!");
+		cy = 0ull;	ASSERT(cy == 0, "Init (cy = 0) fails!");
 		/* x86_64 ASM implementation of the add/carry loop: */
 		__asm__ volatile (\
 			"movq	%[__x0],%%rax	\n\t"/* &x[0] */\
@@ -2385,7 +2385,7 @@ uint64	mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len,
 		uint32 i, lrem = (len&7), len8 = len >> 3;
 		uint64 tmp, cy = 0, c2 = 0;
 	#error mi64_add: no AVX512 support yet!
-	ASSERT(HERE, has_avx512() != 0, "This ASM requires AVX512, which is unavailable on this CPU!");
+	ASSERT(has_avx512() != 0, "This ASM requires AVX512, which is unavailable on this CPU!");
 vpcmpuq
 *** how to encode the base.offset data? ***
 vpgatherqq	%%zmmM,%%zmmD[255]	// zmmM has base_addr and
@@ -2484,17 +2484,17 @@ uint64	mi64_sub(const uint64 x[], const uint64 y[], uint64 z[], uint32 len)
 	uint32 i;
 	uint64 tmp, tmp2, bw = 0;
 
-	ASSERT(HERE, len != 0, "mi64_sub: zero-length array!");
+	ASSERT(len != 0, "mi64_sub: zero-length array!");
 	for(i = 0; i < len; i++) {
 		tmp = x[i] - bw;
 		bw  = (tmp > x[i]);
 //bw  = ((uint64)tmp > (uint64)x[i]);
-		ASSERT(HERE, bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!");
+		ASSERT(bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!");
 		/* Need an extra temp here due to asymmetry of subtract: */
 		tmp2= tmp - y[i];
 		bw += (tmp2 > tmp);
 //bw += ((uint64)tmp2 > (uint64)tmp);
-		ASSERT(HERE, (tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!");
+		ASSERT((tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!");
 		z[i] = tmp2;
 	}
 	return bw;
@@ -2509,17 +2509,17 @@ uint64	mi64_sub_bwin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len,
 	uint32 i;
 	uint64 tmp, tmp2, bw = bwin;
 
-	ASSERT(HERE, len != 0, "mi64_sub: zero-length array!");
+	ASSERT(len != 0, "mi64_sub: zero-length array!");
 	for(i = 0; i < len; i++) {
 		tmp = x[i] - bw;
 		bw  = (tmp > x[i]);
 //bw  = ((uint64)tmp > (uint64)x[i]);
-		ASSERT(HERE, bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!");
+		ASSERT(bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!");
 		/* Need an extra temp here due to asymmetry of subtract: */
 		tmp2= tmp - y[i];
 		bw += (tmp2 > tmp);
 //bw += ((uint64)tmp2 > (uint64)tmp);
-		ASSERT(HERE, (tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!");
+		ASSERT((tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!");
 		z[i] = tmp2;
 	}
 	return bw;
@@ -2565,7 +2565,7 @@ uint64	mi64_add_scalar(const uint64 x[], uint64 a, uint64 y[], uint32 len)
 {
 	uint32 i;
 	uint64 cy = a;
-	ASSERT(HERE, x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!");
+	ASSERT(x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!");
 	if(x == y) {
 		/* In-place: Only need to proceed until carry peters out: */
 		for(i = 0; i < len; i++) {
@@ -2595,7 +2595,7 @@ uint64	mi64_sub_scalar(const uint64 x[], uint64 a, uint64 y[], uint32 len)
 {
 	uint32 i;
 	uint64 bw = a, tmp;
-	ASSERT(HERE, x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!");
+	ASSERT(x != 0x0 && y != 0x0 && len != 0, "mi64_sub_scalar: null-pointer or zero-length array!");
 	if(x == y) {
 		/* In-place: Only need to proceed until borrow peters out: */
 		for(i = 0; i < len; i++) {
@@ -2672,7 +2672,7 @@ uint64	mi64_mul_scalar(const uint64 x[], uint64 a, uint64 y[], uint32 len)
 		cy = hi + (y[i++] < lo);
 	}
 	// Cleanup loop for remaining terms:
-	ASSERT(HERE, len != 0, "zero-length array!");
+	ASSERT(len != 0, "zero-length array!");
 	for(; i < len; i++)
 	{
 	#ifdef MUL_LOHI64_SUBROUTINE
@@ -2712,12 +2712,12 @@ __device__
 uint64	mi64_mul_scalar_add_vec2(const uint64 x[], uint64 a, const uint64 y[], uint64 z[], uint32 len)
 {
 	uint64 cy;	// Jul 2016: Same GCC bug as detailed in mi64_add
-	cy = 0ull;	ASSERT(HERE, cy == 0, "Init (cy = 0) fails!");
+	cy = 0ull;	ASSERT(cy == 0, "Init (cy = 0) fails!");
 #if MI64_MSAV2
 	uint64 *u = 0x0, *v = 0x0;
 	uint64 c2;
 	u = (uint64 *)calloc(len, sizeof(uint64));	v = (uint64 *)calloc(len, sizeof(uint64));
-	ASSERT(HERE, u != 0x0 && v != 0x0, "calloc failed!");
+	ASSERT(u != 0x0 && v != 0x0, "calloc failed!");
 	memcpy(v,y,(len<<3));	// Save copy of x[]
 	c2  = mi64_mul_scalar(x, a, u, len);
 	c2 += mi64_add(u, y, u, len);
@@ -2751,7 +2751,7 @@ uint64	mi64_mul_scalar_add_vec2(const uint64 x[], uint64 a, const uint64 y[], ui
 		cy += (z[i] < tmp);
 	}
 	// Cleanup loop for remaining terms:
-	ASSERT(HERE, len != 0, "zero-length array!");
+	ASSERT(len != 0, "zero-length array!");
 	for(; i < len; i++)
 	{
 	#ifdef MUL_LOHI64_SUBROUTINE
@@ -2850,10 +2850,10 @@ uint64	mi64_mul_scalar_add_vec2(const uint64 x[], uint64 a, const uint64 y[], ui
 	if(!mi64_cmp_eq(u,z,len) || (cy != c2)) {
 		for(i = 0; i < len; i++) {
 		//	if(u[i] != z[i])
-			printf("i = %u Error: U = %20llu, Z = %20llu, Diff = %20lld\n",i,u[i],z[i],(int64)(u[i]-z[i]) );
+			printf("i = %u Error: U = %20" PRIu64 ", Z = %20" PRIu64 ", Diff = %20" PRId64 "\n",i,u[i],z[i],(int64)(u[i]-z[i]) );
 		}
-		if(cy != c2) printf("Carry Error: c2 = %20llu, cy = %20llu, Diff = %20lld\n",c2,cy,(int64)(c2-cy) );
-		ASSERT(HERE, 0, "mi64_add ASM result incorrect!");
+		if(cy != c2) printf("Carry Error: c2 = %20" PRIu64 ", cy = %20" PRIu64 ", Diff = %20" PRId64 "\n",c2,cy,(int64)(c2-cy) );
+		ASSERT(0, "mi64_add ASM result incorrect!");
 	}
 	free((void *)u); u = 0x0;
 	free((void *)v); v = 0x0;
@@ -2897,12 +2897,12 @@ void	mi64_mul_vector(const uint64 x[], uint32 lenX, const uint64 y[], uint32 len
 	static uint64 *u = 0x0;
 	static uint32 dimU = 0;
   #endif
-	ASSERT(HERE, x && y && z, "Null array x/y/z!");
-	ASSERT(HERE, lenX != 0, "zero-length X-array!");
-	ASSERT(HERE, lenY != 0, "zero-length Y-array!");
-	ASSERT(HERE, x != z, "X and Z point to same array object!");
-	ASSERT(HERE, y != z, "Y and Z point to same array object!");
-	ASSERT(HERE, lenZ != 0x0, "Null lenZ pointer!");
+	ASSERT(x && y && z, "Null array x/y/z!");
+	ASSERT(lenX != 0, "zero-length X-array!");
+	ASSERT(lenY != 0, "zero-length Y-array!");
+	ASSERT(x != z, "X and Z point to same array object!");
+	ASSERT(y != z, "Y and Z point to same array object!");
+	ASSERT(lenZ != 0x0, "Null lenZ pointer!");
 
 	/* Init z[] = 0: */
 	for(i = 0; i < lenX + lenY; i++) { z[i] = 0; }
@@ -2935,7 +2935,7 @@ void	mi64_mul_vector(const uint64 x[], uint32 lenX, const uint64 y[], uint32 len
 		if(dimU < 2*(lenA+1)) {         // GG: fixed bug in comparison
 			dimU = 2*(lenA+1);
 			// Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally
-			u = (uint64 *)realloc(u, dimU*sizeof(uint64));	ASSERT(HERE, u != 0x0, "alloc failed!");
+			u = (uint64 *)realloc(u, dimU*sizeof(uint64));	ASSERT(u != 0x0, "alloc failed!");
 		}
 	#endif
 		/* Loop over remaining (lenB-1) elements of B[], multiplying A by each, and
@@ -2951,7 +2951,7 @@ void	mi64_mul_vector(const uint64 x[], uint32 lenX, const uint64 y[], uint32 len
 	more leading terms of the result is zero, caller can adjust vector length accordingly:
 	*/
 	*lenZ = mi64_getlen(z, *lenZ);
-	ASSERT(HERE, *lenZ <= lenA + lenB, "*lenZ > (lenA + lenB)!");
+	ASSERT(*lenZ <= lenA + lenB, "*lenZ > (lenA + lenB)!");
 }
 
 /* Squaring-specialized version of above. By way of example, consider a length-10 input vector and
@@ -3054,11 +3054,11 @@ void	mi64_sqr_vector(const uint64 x[], uint64 z[], uint32 len)
 		if(dbg) printf("realloc to dimU = %u\n",dimU);
 	  #endif
 		// Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally
-		u = (uint64 *)realloc(u, 4* len   *sizeof(uint64));	ASSERT(HERE, u != 0x0, "alloc failed!");
+		u = (uint64 *)realloc(u, 4* len   *sizeof(uint64));	ASSERT(u != 0x0, "alloc failed!");
 	}
   #endif
-	ASSERT(HERE, z != x, "Input and output arrays must be distinct!");
-	ASSERT(HERE, len != 0, "zero-length X-array!");
+	ASSERT(z != x, "Input and output arrays must be distinct!");
+	ASSERT(len != 0, "zero-length X-array!");
 
 	memset(z, 0ull,(len8<<1));	// Clear z[0,...,2*len-1]
 
@@ -3067,7 +3067,7 @@ void	mi64_sqr_vector(const uint64 x[], uint64 z[], uint32 len)
 		z[len] = mi64_mul_scalar(x+1, x[0], z+1, len-1);
 	  #if MI64_SQR_DBG
 		if(dbg) {
-			printf("x0*x[1...n-1] = %llu * %s...\n",x[0],&cbuf[convert_mi64_base10_char(cbuf,x+1,len-1,0)]);
+			printf("x0*x[1...n-1] = %" PRIu64 " * %s...\n",x[0],&cbuf[convert_mi64_base10_char(cbuf,x+1,len-1,0)]);
 			printf("            ... -> z = %s...\n",&cbuf[convert_mi64_base10_char(cbuf,z,2*len,0)]);
 		}
 	  #endif
@@ -3077,7 +3077,7 @@ void	mi64_sqr_vector(const uint64 x[], uint64 z[], uint32 len)
 			z[len+j] = mi64_mul_scalar_add_vec2(x+i, x[j], z+i+j, z+i+j, len-i);
 		  #if MI64_SQR_DBG
 			if(dbg) {
-				printf("x%u*x[%u...n-1] = %llu * %s...\n",j,i,x[j],&cbuf[convert_mi64_base10_char(cbuf,x+i,len-i,0)]);
+				printf("x%u*x[%u...n-1] = %" PRIu64 " * %s...\n",j,i,x[j],&cbuf[convert_mi64_base10_char(cbuf,x+i,len-i,0)]);
 				printf("          ... += z = %s...\n",&cbuf[convert_mi64_base10_char(cbuf,z,2*len,0)]);
 			}
 		  #endif
@@ -3151,14 +3151,14 @@ void	mi64_mul_vector_lo_half	(const uint64 x[], const uint64 y[], uint64 z[], ui
 	/* Scratch array for storing intermediate scalar*vector products: */
 	static uint64 *u = 0x0;
 	static uint32 dimU = 0;
-	ASSERT(HERE, x && y && z, "Null array pointer!");
-	ASSERT(HERE, len != 0, "zero-length X-array!");
+	ASSERT(x && y && z, "Null array pointer!");
+	ASSERT(len != 0, "zero-length X-array!");
 	// Does scratch array need allocating or reallocating? (Use realloc for both cases):
 	if(dimU < 2*(len+1)) {          // GG: fixed bug in comparison
 		dimU = 2*(len+1);
 		// Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally
 		u = (uint64 *)realloc(u, 2*(len+1)*sizeof(uint64));	// NB: realloc leaves newly-alloc'ed size fraction uninited
-		ASSERT(HERE, u != 0x0, "alloc failed!");
+		ASSERT(u != 0x0, "alloc failed!");
 	}
 	memset(u, 0ull, (len<<4));	// Accumulator u[] needs to be cleared each time
   #endif
@@ -3208,11 +3208,11 @@ void	mi64_mul_vector_hi_half	(const uint64 x[], const uint64 y[], uint64 z[], ui
 		// Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally
 		u = (uint64 *)realloc(u, 2*(len+1)*sizeof(uint64));
 		v = (uint64 *)realloc(v, 4* len   *sizeof(uint64));
-		ASSERT(HERE, u != 0x0 && v != 0x0, "alloc failed!");
+		ASSERT(u != 0x0 && v != 0x0, "alloc failed!");
 	}
 	memset(v, 0ull, (len<<4));	// Accumulator v[] needs to be cleared each time
   #endif
-	ASSERT(HERE, len != 0, "zero-length X-array!");
+	ASSERT(len != 0, "zero-length X-array!");
 
 	/* Loop over the elements of y[], multiplying x[] by each, and
 	using u[] as a scratch array to store x[]*y[j] prior to adding to z[].
@@ -3230,7 +3230,7 @@ void	mi64_mul_vector_hi_half	(const uint64 x[], const uint64 y[], uint64 z[], ui
 			continue;
 		u[len] = mi64_mul_scalar(x, y[j], u, len);
 	#if MI64_MULHI_DBG
-		if(dbg) { printf("mi64_mul_vector_hi_half: j = %d, cy = %20llu, U = %s\n",j,u[len], &cbuf[convert_mi64_base10_char(cbuf, u, len+1, 0)]); }
+		if(dbg) { printf("mi64_mul_vector_hi_half: j = %d, cy = %20" PRIu64 ", U = %s\n",j,u[len], &cbuf[convert_mi64_base10_char(cbuf, u, len+1, 0)]); }
 	#endif
 		/* Add j-word-left-shifted u[] to v[]: */
 		/*** 11/2013: Simply could not get this to work using any opt-level > 0 under debian/gcc4.6 ***/
@@ -3240,7 +3240,7 @@ void	mi64_mul_vector_hi_half	(const uint64 x[], const uint64 y[], uint64 z[], ui
 		if(dbg) { printf("mi64_mul_vector_hi_half: j = %d, V = %s\n",j, &cbuf[convert_mi64_base10_char(cbuf, v, len+j+1, 0)]); }
 		if(dbg) {
 			for(i=0;i<=len;++i) {
-				printf("v[%2d] = %20llu\n",i+j,v[i+j]);
+				printf("v[%2d] = %20" PRIu64 "\n",i+j,v[i+j]);
 			}
 		}
 	#endif
@@ -3350,14 +3350,14 @@ void	mi64_mul_vector_hi_trunc(const uint64 x[], const uint64 y[], uint64 z[], ui
 	uint64 tprod[2], cy;
 	static uint64 *u = 0x0, *v = 0x0;	// Scratch arrays for storing intermediate scalar*vector products
 	static uint32 dimU = 0;
-	ASSERT(HERE, len != 0, "zero-length X-array!");
+	ASSERT(len != 0, "zero-length X-array!");
 	// Does scratch array need allocating or reallocating? (Use realloc for both cases):
 	if(dimU < 2*(len+1)) {          // GG: fixed bug in comparison
 		dimU = 2*(len+1);
 		// Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally
 		u = (uint64 *)realloc(u, (len+1)<<4);	// Realloc with 2*(len+1)*sizeof(uint64) bytes
 		v = (uint64 *)realloc(v,  len   <<5);	// Realloc with 4*(len  )*sizeof(uint64) bytes
-		ASSERT(HERE, u != 0x0 && v != 0x0, "alloc failed!");
+		ASSERT(u != 0x0 && v != 0x0, "alloc failed!");
 	}
 	/*
 	Compute desired row-sums by row index. For row j (j renamed 'idx' in function below):
@@ -3401,7 +3401,7 @@ void	mi64_mul_vector_hi_trunc(const uint64 x[], const uint64 y[], uint64 z[], ui
 	// Test code for fast version of this function - re-use low half of v[] for output::
 	mi64_mul_vector_hi_half(x,y,v,len);
 	if(!mi64_cmp_eq(v,v+len,len)) {
-		ASSERT(HERE,0,"mi64_mul_vector_hi_trunc result incorrect!");
+		ASSERT(0,"mi64_mul_vector_hi_trunc result incorrect!");
 	}
   #endif
 	/* Copy v[len:2*len-1] into z[0:len-1]: */
@@ -3458,23 +3458,23 @@ void	mi64_mul_vector_hi_qmmp(const uint64 y[], const uint64 p, const uint64 k, u
 		}
 		u = (uint64 *)calloc(ldim, sizeof(uint64));
 		v = (uint64 *)calloc(ldim, sizeof(uint64));
-		ASSERT(HERE, u != 0x0 && v != 0x0, "alloc failed!");
+		ASSERT(u != 0x0 && v != 0x0, "alloc failed!");
 	}
   #endif
 //====need to finish 200-bit support! =======================
-	ASSERT(HERE, z != y, "Input and output arrays must be distinct!");
-	ASSERT(HERE, p < bits, "shift parameters out of range!");
-	ASSERT(HERE, len != 0, "zero-length X-array!");
+	ASSERT(z != y, "Input and output arrays must be distinct!");
+	ASSERT(p < bits, "shift parameters out of range!");
+	ASSERT(len != 0, "zero-length X-array!");
 	for(i = len+1; i < len2; i++) {
 		u[i] = 0ull;	// With proper padding of U don't need any zeroing of V prior to V = (U << p) step below
 	}
 	// memset(v, 0ull, (len<<4));	// No need to clear Accumulator v[] here due to dim = len2 in mi64_shl below
-	ASSERT(HERE, (k != 0) && ((k2>>1) == k), "2*k overflows!");	// Make sure 2*k did not overflow
+	ASSERT((k != 0) && ((k2>>1) == k), "2*k overflows!");	// Make sure 2*k did not overflow
 	u[len] = mi64_mul_scalar(y,k2,u,len);	// u[] stores Z = 2.k.Y
 	mi64_shl(u,v,p,len2);			// v[] stores (Z << p), store result in V
 	u[len] -= mi64_sub(u,y,u,len);	// (2k-1).Y = Z-Y, store result in U
 	bw = mi64_sub(v,u,v,len+1);
-	ASSERT(HERE, !bw, "Unexpected borrow!");
+	ASSERT(!bw, "Unexpected borrow!");
 
 	/* Right-shift by B bits to get UMULH(q,Y) = ((Z << p) - (2k-1).Y) >> B: */
 	mi64_shrl(v,v,bits,len2,len2);
@@ -3491,13 +3491,13 @@ void	mi64_mul_vector_hi_qmmp(const uint64 y[], const uint64 p, const uint64 k, u
 	u[0] = 1;
 	mi64_shl(u, u, p, len);			// 2^p
 	mi64_sub_scalar(u, 1, u, len);	// M(p) = 2^p-1
-	ASSERT(HERE, 0 == mi64_mul_scalar(u, k2, u, len), "2.k.M(p) overflows!");	// 2.k.M(p)
+	ASSERT(0 == mi64_mul_scalar(u, k2, u, len), "2.k.M(p) overflows!");	// 2.k.M(p)
 	mi64_add_scalar(u, 1ull, u, len);	// q = 2.k.M(p) + 1
 	// Test code for fast version of this function - re-use v[] for output::
 //	mi64_mul_vector_hi_half(u,y,v,len);
 	mi64_mul_vector_hi_fast(y,p,k,v,len);
 	if(!mi64_cmp_eq(v,z,len)) {
-		ASSERT(HERE, 0, "mi64_mul_vector_hi_qmmp/fast results differ!");
+		ASSERT(0, "mi64_mul_vector_hi_qmmp/fast results differ!");
 	}
 #endif
 }
@@ -3569,22 +3569,22 @@ void	mi64_mul_vector_hi_fast(const uint64 y[], const uint64 p, const uint64 k, u
 	uint32 i, bits;
 	uint64 k2m1 = k-1+k, tmp,bw0,bw1,bw,cw,cy,cz;
 	uint64 *zptr;
-	ASSERT(HERE, z != y, "Input and output arrays must be distinct!");
-	ASSERT(HERE, (k != 0) && ((k2m1>>1) == k-1), "2*k-1 overflows!");
-	ASSERT(HERE, len != 0, "zero-length X-array!");
+	ASSERT(z != y, "Input and output arrays must be distinct!");
+	ASSERT((k != 0) && ((k2m1>>1) == k-1), "2*k-1 overflows!");
+	ASSERT(len != 0, "zero-length X-array!");
 
 // 1. compute z' = (2k-1).y via vector-scalar mul, the carryout word cw = ((2k-1).Y >> B);
 	cw = mi64_mul_scalar(y,k2m1,z,len);	// z' = (2k-1).y
 	bw0 = z[len-1];
-//if(k==900) printf("Mi64: bw0 = %20llu, cw = %20llu, z` = %s\n", bw0,cw,&s0[convert_mi64_base10_char(s0, z, len, 0)]);
+//if(k==900) printf("Mi64: bw0 = %20" PRIu64 ", cw = %20" PRIu64 ", z` = %s\n", bw0,cw,&s0[convert_mi64_base10_char(s0, z, len, 0)]);
 // 2. compute low n words of z = z' + y via vector-vector add, any carryout of that gets added to a 2nd copy of cw, cz;
 	cz = cw + mi64_add(y,z,z, len);	// z = z' + y
-//if(k==900) printf("Mi64: cz = %20llu, z = %s\n", cz,&s0[convert_mi64_base10_char(s0, z, len, 0)]);
+//if(k==900) printf("Mi64: cz = %20" PRIu64 ", z = %s\n", cz,&s0[convert_mi64_base10_char(s0, z, len, 0)]);
 
 // 3. compute low n words of z >> (b-p), then separately shift in cz from the left, via (2^b*cz) >> (b-p) = (cz << p).
-	ASSERT(HERE, (len<<6) > p, "shift parameters out of range!");
+	ASSERT((len<<6) > p, "shift parameters out of range!");
 	bw1 = mi64_shrl(z,z,(len<<6)-p,len,len);	// low n words of z >> (b-p); high 64 bits of off-shifted portion saved in bw1
-//if(k==900) printf("Mi64: bw1 = %20llu, z>> = %s\n", bw1,&s0[convert_mi64_base10_char(s0, z, len, 0)]);
+//if(k==900) printf("Mi64: bw1 = %20" PRIu64 ", z>> = %s\n", bw1,&s0[convert_mi64_base10_char(s0, z, len, 0)]);
 
 /* Check for borrow-on-subtract of to-be-off-shifted sections: have a borrow if
 	z' (result from above mul_scalar, not including the carryout word cw) >	((z << p) % 2^b) (off-shifted portion of z = z' + y above, left-justified to fill a b-bit field)
@@ -3600,18 +3600,18 @@ and tell the user to call the slow exact version of this function, currently ina
 	zptr = z+i;
 	// If (b-p) == 0 (mod 64) all of cz goes into z[i], with i = (b-p)/64;
 	if(bits == 0) {
-		ASSERT(HERE, 0 == mi64_add_scalar(zptr,cz,zptr,len-i), "unexpected carryout of ( + cw)!");
+		ASSERT(0 == mi64_add_scalar(zptr,cz,zptr,len-i), "unexpected carryout of ( + cw)!");
 	// Otherwise cz gets split between z[i] and z[i+1]:
 	} else {
 		// low 64-(p%64) bits of cz = (cz << bits) go into z[i]:
 		tmp = (cz << bits);
 		*zptr += tmp; cy = (*zptr++ < tmp);
 		// high (p%64) bits of cw = (cw >> bits) go into z[i+1]
-		ASSERT(HERE, 0 == mi64_add_scalar(zptr,(cz >> (64-bits)) + cy,zptr,len-i-1), "unexpected carryout of ( + cw).hi!");
+		ASSERT(0 == mi64_add_scalar(zptr,(cz >> (64-bits)) + cy,zptr,len-i-1), "unexpected carryout of ( + cw).hi!");
 	}
 
 // 4. subtract scalar (bw + cw) from resulting vector to effect ... - (2k-1).Y step in [*].
-	ASSERT(HERE, 0 == mi64_sub_scalar(z,(bw + cw),z,len), "unexpected carryout of (... - cw) !");
+	ASSERT(0 == mi64_sub_scalar(z,(bw + cw),z,len), "unexpected carryout of (... - cw) !");
 }
 
 
@@ -3658,12 +3658,12 @@ void	mi64_mul_vector_hi_qferm(const uint64 y[], const uint64 p, const uint64 k,
 			free((void *)u); u = 0x0;
 		}
 		u = (uint64 *)calloc(ldim, sizeof(uint64));
-		ASSERT(HERE, u != 0x0, "alloc failed!");
+		ASSERT(u != 0x0, "alloc failed!");
 	}
   #endif
-	ASSERT(HERE, z != y, "Input and output arrays must be distinct!");
-	ASSERT(HERE, p < bits, "shift parameters out of range!");
-	ASSERT(HERE, k != 0ull, "k must be nonzero!");
+	ASSERT(z != y, "Input and output arrays must be distinct!");
+	ASSERT(p < bits, "shift parameters out of range!");
+	ASSERT(k != 0ull, "k must be nonzero!");
 	for(i = len+1; i < len2; i++) {
 		u[i] = 0ull;	// With proper padding of U don't need any zeroing of V prior to V = (U << p) step below
 	}
@@ -3672,7 +3672,7 @@ void	mi64_mul_vector_hi_qferm(const uint64 y[], const uint64 p, const uint64 k,
 	mi64_shl(u,u,(p+1),len2);				// u[] stores (Z << p)
 	cy = mi64_add(u,y,u,len);
 	cy = mi64_add_scalar(u+len,cy,u+len, len2-len);
-	ASSERT(HERE, (cy == 0ull), "Unexpected carry!");
+	ASSERT((cy == 0ull), "Unexpected carry!");
 
 	/* Right-shift by B bits to get UMULH(q,Y) = ((Z << p) - (2k-1).Y) >> B: */
 	mi64_shrl(u,u,bits,len2,len2);
@@ -3705,14 +3705,14 @@ uint32	mi64_cvt_uint64_double(const uint64 x[], const uint64 y[], uint32 cy, uin
 	 int64 cyi, cyj, itmp, jtmp;
 	uint64 curr_re64, curr_im64, bitsm1 = FFT_MUL_BITS-1, basem1 = FFT_MUL_BASE-1;
 
-	ASSERT(HERE, len != 0, "mi64_cvt_uint64_double: zero-length array!");
+	ASSERT(len != 0, "mi64_cvt_uint64_double: zero-length array!");
 
 	/* Only constant base 2^16 is supported for this conversion at present: */
-	ASSERT(HERE, FFT_MUL_BITS == 16, "mi64_cvt_uint64_double: FFT_MUL_BITS != 16");
+	ASSERT(FFT_MUL_BITS == 16, "mi64_cvt_uint64_double: FFT_MUL_BITS != 16");
 
 	/* Redo the quicker checks of those done in util.c::check_nbits_in_types() */
-	ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_uint64_double: FFT_MUL_BASE not pure-integer!");
-	ASSERT(HERE, FFT_MUL_BASE < TWO54FLOAT, "mi64_cvt_uint64_double: FFT_MUL_BASE >= maximum allowed value of 2^54!");
+	ASSERT(DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_uint64_double: FFT_MUL_BASE not pure-integer!");
+	ASSERT(FFT_MUL_BASE < TWO54FLOAT, "mi64_cvt_uint64_double: FFT_MUL_BASE >= maximum allowed value of 2^54!");
 
 	/* As we extract each floating-point word, balance it and set
 	resulting carry into next FP word: */
@@ -3740,16 +3740,16 @@ uint32	mi64_cvt_uint64_double(const uint64 x[], const uint64 y[], uint32 cy, uin
 			a[jpad+1] = (double)(jtmp - (cyj<<FFT_MUL_BITS));
 		}
 	}
-	ASSERT(HERE, cyi <= 1 && cyj <= 1,"mi64_cvt_uint64_double: Output carry out of range!");
+	ASSERT(cyi <= 1 && cyj <= 1,"mi64_cvt_uint64_double: Output carry out of range!");
 #if 0
 	// It is desirable to not have the FP vector length exceed 4*len,
 	// so suppress any output carry by folding back into MS array element:
 	if(cyi) {
-		ASSERT(HERE, a[jpad  ] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!");
+		ASSERT(a[jpad  ] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!");
 		a[jpad  ] += FFT_MUL_BASE;
 	}
 	if(cyj) {
-		ASSERT(HERE, a[jpad+1] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!");
+		ASSERT(a[jpad+1] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!");
 		a[jpad+1] += FFT_MUL_BASE;
 	}
 printf("mi64_cvt_uint64_double: Final a[%u,%u] = %15.3f,%15.3f\n",jpad,jpad+1,a[jpad],a[jpad+1]);
@@ -3790,18 +3790,18 @@ uint32	mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[]
 	int64 cy_re, cy_im, itmp, jtmp;
 	uint64 curr_re64, curr_im64;
 
-	ASSERT(HERE, n != 0, "zero-length array!");
+	ASSERT(n != 0, "zero-length array!");
 
 	/* Redo the quicker checks of those done in util.c::check_nbits_in_types() */
-	ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "FFT_MUL_BASE not pure-integer!");
-	ASSERT(HERE, FFT_MUL_BASE < TWO54FLOAT, "FFT_MUL_BASE >= maximum allowed value of 2^54!");
+	ASSERT(DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "FFT_MUL_BASE not pure-integer!");
+	ASSERT(FFT_MUL_BASE < TWO54FLOAT, "FFT_MUL_BASE >= maximum allowed value of 2^54!");
 /* Obsolete, for historical reference only:
 	// Make sure MSW of Re(A[]) and Im(A[]) in the balanced-representation form are both >= 0:
 	// Re(A[]) stored in even terms:
 	for(i = 2*n-2; i >= 0; i-=2) {
 		j = i + ( (i >> DAT_BITS) << PAD_BITS );
 		if(a[j] != 0.0) {
-			ASSERT(HERE, a[j] > 0.0, "MSW(Re(A[])) < 0!");
+			ASSERT(a[j] > 0.0, "MSW(Re(A[])) < 0!");
 			break;
 		}
 	}
@@ -3809,7 +3809,7 @@ uint32	mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[]
 	for(i = 2*n-1; i >= 1; i-=2) {
 		j = i + ( (i >> DAT_BITS) << PAD_BITS );
 		if(a[j] != 0.0) {
-			ASSERT(HERE, a[j] > 0.0, "MSW(Im(A[])) < 0!");
+			ASSERT(a[j] > 0.0, "MSW(Im(A[])) < 0!");
 			break;
 		}
 	}
@@ -3827,9 +3827,9 @@ uint32	mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[]
 		j = i + ( (i >> DAT_BITS) << PAD_BITS );
 
 
-		itmp = (uint64)1<<curr_bits;		ASSERT(HERE, curr_bits < 64,"curr_bits < 64");
-		ASSERT(HERE, curr_re64 < itmp && curr_im64 < itmp,"curr_wd64 !< (1<<curr_bits)");
-		ASSERT(HERE, DNINT(a[j]) == a[j] && ABS(a[j]) < TWO54FLOAT, "a[j] not pure-integer or out of range!");
+		itmp = (uint64)1<<curr_bits;		ASSERT(curr_bits < 64,"curr_bits < 64");
+		ASSERT(curr_re64 < itmp && curr_im64 < itmp,"curr_wd64 !< (1<<curr_bits)");
+		ASSERT(DNINT(a[j]) == a[j] && ABS(a[j]) < TWO54FLOAT, "a[j] not pure-integer or out of range!");
 
 		itmp = (int64)a[j  ] + cy_re;	/* current digit in int64 form, subtracting any borrow from previous digit.	*/
 		if(itmp < 0) {	/* If current digit < 0, add the base and set carry = -1	*/
@@ -3846,8 +3846,8 @@ uint32	mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[]
 		} else {
 			cy_im = 0;
 		}
-		ASSERT(HERE, itmp >= 0 && jtmp >= 0,"itmp,jtmp must be nonnegative 0!");
-		ASSERT(HERE, (curr_re64>>curr_bits) == 0 && (curr_im64>>curr_bits) == 0,"(curr_wd64>>curr_bits) != 0!");
+		ASSERT(itmp >= 0 && jtmp >= 0,"itmp,jtmp must be nonnegative 0!");
+		ASSERT((curr_re64>>curr_bits) == 0 && (curr_im64>>curr_bits) == 0,"(curr_wd64>>curr_bits) != 0!");
 
 		/* Copy bits of the current residue word into the accumulator, starting
 		at the (curr_bits)th bit. The resulting total number of accumulated bits
@@ -3884,10 +3884,10 @@ uint32	mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[]
 		y[len++] = curr_im64;
 		nbits += curr_bits;
 	}
-//	printf("mi64_cvt_double_uint64: Final a[%u,%u] = %15.3f,%15.3f; x,y[%u] = %llu,%llu\n",j,j+1,a[j],a[j+1],len-1,x[len-1],y[len-1]);
-	ASSERT(HERE, nbits == n*FFT_MUL_BITS,"nbits == n*FFT_MUL_BASE!");
-	ASSERT(HERE, len == (n>>2)          ,"len should == n/4!");
-	ASSERT(HERE, ABS(cy_re) <= 1 && ABS(cy_im) <= 1,"Output carry out of range!");
+//	printf("mi64_cvt_double_uint64: Final a[%u,%u] = %15.3f,%15.3f; x,y[%u] = %" PRIu64 ",%" PRIu64 "\n",j,j+1,a[j],a[j+1],len-1,x[len-1],y[len-1]);
+	ASSERT(nbits == n*FFT_MUL_BITS,"nbits == n*FFT_MUL_BITS!");
+	ASSERT(len == (n>>2)          ,"len should == n/4!");
+	ASSERT(ABS(cy_re) <= 1 && ABS(cy_im) <= 1,"Output carry out of range!");
 	// Carries declared signed, but throw in casts of the 0 in the < compares to ensure signedness of these:
 	return ( (cy_im < (int64)0)*8 + (cy_im != 0ull)*4 + (cy_re < (int64)0)*2 + (cy_re != 0ull) );
 }
@@ -3898,15 +3898,15 @@ uint32	mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[]
 uint32 mi64_init_mers_or_ferm_modulus(uint64 exp, int modtype, uint64 mvec[])
 {
 	uint32 i,j;	// j = uint64 vector length
-	ASSERT(HERE, mvec != 0x0, "Null output-vector pointer!");
+	ASSERT(mvec != 0x0, "Null output-vector pointer!");
 	if(modtype == 0) {	// Mersenne, 2^exp - 1
-		ASSERT(HERE, isPRP64(exp), "Mersenne exponent must be prime!");
+		ASSERT(isPRP64(exp), "Mersenne exponent must be prime!");
 		j = (exp+63)>>6;
 		// Loop rather than call to mi64_set_eq_scalar here, since need to set all elts = -1:
 		for(i = 0; i < j; i++) { mvec[i] = -1ull; }
 		mvec[j-1] >>= 64-(exp&63);	// Leading word needs >> to leave just low exp%64 bits set
 	} else {	// Fermat, 2^exp + 1
-		ASSERT(HERE, exp < 64, "Max supported Fermat-number index = 63!");
+		ASSERT(exp < 64, "Max supported Fermat-number index = 63!");
 		j = ((1ull << exp)+63)>>6;
 		// j = uint64 vector length; init sans the leading '1' word, then increment prior to mi64_div
 		mi64_clear(mvec,j);
@@ -3928,7 +3928,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 {
 	const uint32 max_dim = 4096;
 	uint64 n[max_dim],result[max_dim];
-	ASSERT(HERE, len <= max_dim, "mi64_pprimeF: Required array length exceeds dimensioned maximum!");
+	ASSERT(len <= max_dim, "mi64_pprimeF: Required array length exceeds dimensioned maximum!");
 	mi64_set_eq(n, p, len);	mi64_sub_scalar(n, 1ull, n, len);	/* n = p - 1 */
 	mi64_scalar_modpow_lr(z, n, p, len, result);
 	return mi64_cmp_eq_scalar(result,1ull, len);
@@ -3962,7 +3962,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 	  #endif
 	  #if MI64_PRP_DBG
 	  if(dbg) {
-		printf("mi64_scalar_modpow_lr: %llu^%s (mod q = %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,q,len,0)]);
+		printf("mi64_scalar_modpow_lr: %" PRIu64 "^%s (mod q = %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,q,len,0)]);
 		printf("Using Montgomery-multiply remaindering.\n");
 	  }
 	  #endif
@@ -3970,13 +3970,13 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 			mi64_clear(c,len);	c[0] = a;
 			return;
 		}
-		ASSERT(HERE, b != 0x0 && c != 0x0, "Null input- or output-array pointer!");
+		ASSERT(b != 0x0 && c != 0x0, "Null input- or output-array pointer!");
 		// Working length = length of product of scalar powering-base and modulus vector;
 		// must not assume [len] reflects number nonzero limbs, i.e. thee might be 0-pads at high end:
-		wlen = mi64_getlen(q, len);		ASSERT(HERE, wlen > 0, "0-length modulus!");
+		wlen = mi64_getlen(q, len);		ASSERT(wlen > 0, "0-length modulus!");
 		// Increment working length if a*q overflows into the next-higher limb:
 		i64 = mi64_mul_scalar(q,a,prod,wlen);	wlen += (i64 != 0ull);
-		ASSERT(HERE, wlen <= max_dim, "mi64_modpow_lr: Required array length exceeds dimensioned maximum!");
+		ASSERT(wlen <= max_dim, "mi64_modpow_lr: Required array length exceeds dimensioned maximum!");
 		// Init writable local array n[] = q[], including 0-pad at top if a*q overflows len limbs
 		mi64_set_eq(n, q, len); if(i64) n[wlen-1] = 0ull;	// Use carryo4t i64 rather than (wlen > len) here, since wlen may be < len
 		wlen2 = wlen + wlen;
@@ -3985,8 +3985,8 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 		if(dbg) printf("Modulus q has %u limbs, a*q has %u limbs\n",mi64_getlen(q,len),wlen);
 	  #endif
 		nbits = wlen << 6;		log2_numbits = ceil(log(1.0*nbits)/log(2.0));
-		ASSERT(HERE, IS_ODD(n[0]), "Modulus must be odd for Montgomery-mod-based LR binary powering!");
-		if(len == 1) ASSERT(HERE, a < n[0], "Input base array must be properly normalized (mod q)!");
+		ASSERT(IS_ODD(n[0]), "Modulus must be odd for Montgomery-mod-based LR binary powering!");
+		if(len == 1) ASSERT(a < n[0], "Input base array must be properly normalized (mod q)!");
 		/*
 		Find modular inverse (mod 2^nbits) of w in preparation for modular multiply.
 		w must be odd for Montgomery-style modmul to work.
@@ -4015,7 +4015,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 		}
 		// Check the computed inverse:
 		mi64_mul_vector_lo_half(n, ninv, prod, wlen);
-		ASSERT(HERE, mi64_cmp_eq_scalar(prod, 1ull, wlen), "Bad Montmul inverse!");
+		ASSERT(mi64_cmp_eq_scalar(prod, 1ull, wlen), "Bad Montmul inverse!");
 	#if MI64_PRP_DBG
 		if(dbg) printf("qinv = %s\n", &cbuf[convert_mi64_base10_char(cbuf, ninv, wlen, 0)]);
 	#endif
@@ -4029,7 +4029,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 		j = mi64_leadz(b,len); start_index = (len<<6) - j;
 	  #if MI64_PRP_DBG
 	  if(dbg) {
-		printf("base a[] = %llu, start_bit = %d\n",a,start_index-2);
+		printf("base a[] = %" PRIu64 ", start_bit = %d\n",a,start_index-2);
 		printf("R*a (mod q) = %s\n",&cbuf[convert_mi64_base10_char(cbuf,c,wlen, 0)]);
 	  }
 	  #endif
@@ -4046,9 +4046,9 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 			// do bit-dependent mul-by-base here on the double-wide squaring output:
 		  #if !DO_N_MODSQUARES
 			if(mi64_test_bit(b,j)) {
-				i64 = mi64_mul_scalar(prod, a, prod, wlen2);	ASSERT(HERE, i64 == 0ull, "Unexpected carry out of a*x^2!");
+				i64 = mi64_mul_scalar(prod, a, prod, wlen2);	ASSERT(i64 == 0ull, "Unexpected carry out of a*x^2!");
 		      #if MI64_PRP_DBG
-				if(dbg) printf("*= %llu = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]);
+				if(dbg) printf("*= %" PRIu64 " = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]);
 		      #endif
 			}
 		  #endif	// endif !DO_N_MODSQUARES
@@ -4060,21 +4060,21 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 		  #endif
 			// If hi < lo, then calculate (hi-lo)+q = q-lo+hi < q; otherwise calculate hi-lo:
 			if(mi64_cmpult(hi,lo,wlen)) {
-				i64 = mi64_sub(hi,lo,lo,wlen);	ASSERT(HERE, i64, "Expected a borrow!");
-				i64 = mi64_add(n ,lo,c ,wlen);	ASSERT(HERE, i64, "Expected borrow/carry cancellation!");
+				i64 = mi64_sub(hi,lo,lo,wlen);	ASSERT(i64, "Expected a borrow!");
+				i64 = mi64_add(n ,lo,c ,wlen);	ASSERT(i64, "Expected borrow/carry cancellation!");
 			} else {
-				i64 = mi64_sub(hi,lo,c ,wlen);	ASSERT(HERE,!i64, "Unexpected borrow!");
+				i64 = mi64_sub(hi,lo,c ,wlen);	ASSERT(!i64, "Unexpected borrow!");
 			}
 		   #if MI64_PRP_DBG
 			if(dbg) printf("(mod q) = %s\n", &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)]);
-			if(dbg && !(j & 1023)) printf("At bit %d: Res64 = %016llX\n",j,c[0]);
+			if(dbg && !(j & 1023)) printf("At bit %d: Res64 = %016" PRIX64 "\n",j,c[0]);
 		   #endif
 		}
 		// Do a final Montmul-by-1 to remove the excess *R (mod q); hi = 0 here simplifies things:
 		mi64_mul_vector_lo_half( c,ninv,lo,wlen);
 		mi64_mul_vector_hi_half(lo,n   ,lo,wlen);
 		// (hi-lo)+q = q-lo+hi = q-lo:
-		i64 = mi64_sub(n,lo, c,len);	ASSERT(HERE,!i64, "Unxpected borrow!");
+		i64 = mi64_sub(n,lo, c,len);	ASSERT(!i64, "Unexpected borrow!");
 	#if MI64_PRP_DBG
 	  if(dbg) printf("retval = %s\n", &cbuf[convert_mi64_base10_char(cbuf, c, len, 0)]);
 	#endif
@@ -4098,25 +4098,25 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 	  #endif
 	  #if MI64_PRP_DBG
 	  if(dbg) {
-		printf("mi64_scalar_modpow_lr: %llu^%s (mod %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,b,len,0)]);
+		printf("mi64_scalar_modpow_lr: %" PRIu64 "^%s (mod %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,b,len,0)]);
 	  }
 	  #endif
 		if(!a) {	// a = 0; set result c[] = 0 and return
 			mi64_clear(c,len);
 			return;
 		}
-		ASSERT(HERE, b != 0x0 && c != 0x0, "Null input- or output-array pointer!");
-		ASSERT(HERE, len <= 1024, "mi64_modpow_lr: Max 1024 words allowed at present!");
+		ASSERT(b != 0x0 && c != 0x0, "Null input- or output-array pointer!");
+		ASSERT(len <= 1024, "mi64_modpow_lr: Max 1024 words allowed at present!");
 		mi64_set_eq_scalar(c, a, len);	// Init result-holding array c[0] = a
 		mi64_set_eq(npad	, n, len);	mi64_clear(npad+len, len);	// set npad = a[]; npad is zero-padded
 		// Working length = length of actual modulus vector:
-		wlen = mi64_getlen(n, len);		ASSERT(HERE, wlen > 0, "0-length array!");
-		ASSERT(HERE, mi64_cmpult(c, n, len), "Input base array must be properly normalized (mod n)!");
+		wlen = mi64_getlen(n, len);		ASSERT(wlen > 0, "0-length array!");
+		ASSERT(mi64_cmpult(c, n, len), "Input base array must be properly normalized (mod n)!");
 		// LR modpow:
 		j = leadz64(b[wlen-1]); start_index = (wlen<<6) - j;
 	  #if MI64_PRP_DBG
 	  if(dbg) {
-		printf("base a[] = %llu, start_bit = %d\n",a,start_index-1);
+		printf("base a[] = %" PRIu64 ", start_bit = %d\n",a,start_index-1);
 		printf("x0 = %s, len = %u\n", &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)], wlen);
 	  }
 	  #endif
@@ -4129,7 +4129,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 			mi64_sqr_vector(c, prod, wlen);		/* x^2 */
 		//	mi64_div(prod, npad, len2, len2, 0x0, prod);	*** Fails on F28 cofactor-PRP 3^nsquares (mod q) check; for x = 146715292687661855688^2 % q get 314605340220462438224, should = 240587464360836147143! ***
 			mi64_div_binary(prod, npad, len2, len2, 0x0,&lenq, prod);
-			ASSERT(HERE, mi64_getlen(prod, len2) <= len, "mi64_modpow_lr: (x^2)%p illegal length");
+			ASSERT(mi64_getlen(prod, len2) <= len, "mi64_modpow_lr: (x^2)%p illegal length");
 			mi64_set_eq(c, prod, len);	/* c = (c^2)%p */
 		  #if MI64_PRP_DBG
 			if(dbg) printf("j = %d: x^2 (mod n) = %s\n", j, &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)]);
@@ -4138,7 +4138,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 			if(mi64_test_bit(b,j)) {
 				prod[wlen] = mi64_mul_scalar(c, a, prod, wlen);
 		   #if MI64_PRP_DBG
-			if(dbg) printf("*= %llu = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]);
+			if(dbg) printf("*= %" PRIu64 " = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]);
 		   #endif
 				mi64_div_binary(prod, n, wlen+1, wlen, 0x0,0x0, c); 	// x = prod % p
 		   #if MI64_PRP_DBG
@@ -4146,7 +4146,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len)
 		   #endif
 			}
 		  #endif	// endif !DO_N_MODSQUARES
-			if(!(j & 1023)) printf("At bit %d: Res64 = %016llX\n",j,c[0]);
+			if(!(j & 1023)) printf("At bit %d: Res64 = %016" PRIX64 "\n",j,c[0]);
 		}
 	#if MI64_PRP_DBG
 	  if(dbg) printf("retval = %s\n", &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)]);
@@ -4205,10 +4205,10 @@ void mi64_vcvtuqq2pd(const uint64 a[], double b[])
 		: "cc","memory","rax","rbx","rcx","xmm0","xmm1","xmm31"	/* Clobbered registers */\
 	);
 	for(i = 0; i < 8; i++) {
-		ASSERT(HERE, b[i] == (double)a[i], "uint64->double conversion result differs from reference!");
+		ASSERT(b[i] == (double)a[i], "uint64->double conversion result differs from reference!");
 	}
 #else
-	ASSERT(HERE, 0,"mi64_vcvtuqq2pd requires build with AVX512 instruction set!\n");
+	ASSERT(0,"mi64_vcvtuqq2pd requires build with AVX512 instruction set!\n");
 #endif	// USE_AVX ?
 }
 
@@ -4224,10 +4224,10 @@ void mi64_vcvtpd2uqq(const double a[], uint64 b[])
 		: "cc","memory","rax","rbx","rcx","xmm0","xmm1","xmm31"	/* Clobbered registers */\
 	);
 	for(i = 0; i < 8; i++) {
-		ASSERT(HERE, (double)b[i] == a[i], "double->uint64 conversion result differs from reference!");
+		ASSERT((double)b[i] == a[i], "double->uint64 conversion result differs from reference!");
 	}
 #else
-	ASSERT(HERE, 0,"mi64_vcvt2pduqq requires build with AVX512 instruction set!\n");
+	ASSERT(0,"mi64_vcvtpd2uqq requires build with AVX512 instruction set!\n");
 #endif	// USE_AVX ?
 }
 #endif	// __CUDA_ARCH__ ?
@@ -4686,14 +4686,14 @@ void mi64_modmul53_batch(const double a[], const double b[], const double m[], d
 			: "cc","memory","rax","rbx","rcx","rdx"	/* Clobbered registers */\
 		);
 		if(r[i] != rem64 && r[i] != im+rem64) {	// Allow for rem to be either in [-m/2,+m/2] or in [0,m)
-			printf("[%2u/%2u]: %16llu * %16llu / %16llu = %16llu[quo], %16llu[rem], DP rem = %16.0f\n",i,ndata,ia,ib,im,quo64,rem64, r[i]);
+			printf("[%2u/%2u]: %16" PRIu64 " * %16" PRIu64 " / %16" PRIu64 " = %16" PRIu64 "[quo], %16" PRIu64 "[rem], DP rem = %16.0f\n",i,ndata,ia,ib,im,quo64,rem64, r[i]);
 			if(++nerr > 1000) exit(0);
 		}
-	//	ASSERT(HERE, r[i] == rem64, "Modmul result differs from reference!");
+	//	ASSERT(r[i] == rem64, "Modmul result differs from reference!");
 	}
   #endif
 #else
-	ASSERT(HERE, 0,"mi64_modmul53_batch requires build with AVX2 instruction set!\n");
+	ASSERT(0,"mi64_modmul53_batch requires build with AVX2 instruction set!\n");
 #endif	// USE_AVX ?
 }
 #endif	// __CUDA_ARCH__ ?
@@ -4717,7 +4717,7 @@ Test harness code:
 			if(a > m) a %= m;
 			if(b > m) b %= m;
 			uint64 r = mi64_modmul64(a,b,m);
-		//	printf("(%llu * %llu) mod %llu = %llu\n",a,b,m,r);
+		//	printf("(%" PRIu64 " * %" PRIu64 ") mod %" PRIu64 " = %" PRIu64 "\n",a,b,m,r);
 		}
 	}
 */
@@ -4732,7 +4732,7 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m)
 	if(first_entry) {
 		unsigned short FPUCTRL;
 		__asm__ volatile ("fstcw %0" : "=m" (FPUCTRL) );
-		ASSERT(HERE, FPUCTRL == FPU_64CHOP, "This function requires user to set x87 FPU to truncatig-round mode!");
+		ASSERT(FPUCTRL == FPU_64CHOP, "This function requires user to set x87 FPU to truncating-round mode!");
 		first_entry = FALSE;
 	}
 	// x86_64 modmul code using 64-bit FDIV for quotient - 2 versions, first one for 63-bit inputs, needs ~36 cycles on Core2:
@@ -4850,7 +4850,7 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m)
 		// for which the true q = 13545154436197203258, but FDIV produces 13545154436197203256.
 		// with this added check I got 10^11 sets of genuine 64-bit inputs to run sans errors:
 		if(r >= m) {
-		//	printf("a,b,m = %llu, %llu, %llu; FDIV-mod gives r = %llu\n",a,b,m,r);
+		//	printf("a,b,m = %" PRIu64 ", %" PRIu64 ", %" PRIu64 "; FDIV-mod gives r = %" PRIu64 "\n",a,b,m,r);
 			r -= m;
 		}
 		/* This was the code to normalize one of the inputs before I used the -= 2^63 trick, so you can see the usefulness of the latter:
@@ -4872,15 +4872,15 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m)
 	//     only explicitly store i' = i%2^64, high bit handled implicitly:
 	uint32 i;	uint64 mulh,twoi,diff = -1ull,ip = -m;	// Initial iterate = 2^65 - m = [1,-m] in base-2^64 twos-comp form. Init diff = UINT64_MAX
 	for(i = 0; i < 10; i++) {
-		twoi = ip + ip;					//ASSERT(HERE, twoi > ip  , "Unexpected overflow in 2*ip computation!");
-		mulh = twoi + __MULH64(ip,ip);	ASSERT(HERE, mulh > twoi, "Unexpected overflow in mulh summation!");
+		twoi = ip + ip;					//ASSERT(twoi > ip  , "Unexpected overflow in 2*ip computation!");
+		mulh = twoi + __MULH64(ip,ip);	ASSERT(mulh > twoi, "Unexpected overflow in mulh summation!");
 		mulh = __MULH64(m,mulh);
 		#error*** Mar 2021: Hit above assert with inputs (a=2, b=2, m=5000099); commenting it out, the iteration fails to converge ***
 		diff = ip - m - mulh;
 		ip += diff;
 		if(!diff) break;
 	}
-	ASSERT(HERE, !diff, "Barrett-modmul scaled inverse computation failed to converge!");
+	ASSERT(!diff, "Barrett-modmul scaled inverse computation failed to converge!");
 	uint64 lo,hi;
   #ifdef MUL_LOHI64_SUBROUTINE
 	MUL_LOHI64(a,b,&lo,&hi);
@@ -4958,7 +4958,7 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m)
 		 ,[__m] "g" (m)	\
 		: "cc","memory","rax","rbx","rcx","rdx"	/* Clobbered registers */\
 	);
-	ASSERT(HERE, r == i64, "Modmul result differs from reference!");
+	ASSERT(r == i64, "Modmul result differs from reference!");
 #endif
 
 	return r;
@@ -4978,11 +4978,11 @@ int mi64_div(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, uint6
 	uint32 xlen, ylen, max_len;
 	uint64 itmp64;
 	// Only the quotient array is optional:
-	ASSERT(HERE, lenX && lenY, "illegal 0 dimension!");
-	ASSERT(HERE, x && y, "At least one of X, Y is null!");
-	ASSERT(HERE, x != y, "X and Y arrays overlap!");
-	ASSERT(HERE, r != y, "Y and Rem arrays overlap!");
-	ASSERT(HERE, q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!");
+	ASSERT(lenX && lenY, "illegal 0 dimension!");
+	ASSERT(x && y, "At least one of X, Y is null!");
+	ASSERT(x != y, "X and Y arrays overlap!");
+	ASSERT(r != y, "Y and Rem arrays overlap!");
+	ASSERT(q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!");
 
 	/* Init Q = 0; don't do similarly for R since we allow X and R to point to same array: */
 	if(q && (q != x)) {
@@ -4991,7 +4991,7 @@ int mi64_div(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, uint6
 	/* And now find the actual lengths of the divide operands and use those for the computation: */
 	xlen = mi64_getlen(x, lenX);
 	ylen = mi64_getlen(y, lenY);
-	ASSERT(HERE, ylen != 0, "divide by 0!");
+	ASSERT(ylen != 0, "divide by 0!");
 
 	// If x < y, no modding needed - copy x into remainder and set quotient = 0:
 	max_len = MAX(xlen, ylen);
@@ -5048,12 +5048,12 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 	static uint64 *scratch = 0x0;	// "base pointer" for local storage shared by all of the above subarrays
 	static uint64 *hi = 0x0, *v = 0x0, *w = 0x0;	// These are treated as vars (cost-offsets of the above ptrs),
 													// hence non-static. *** MUST RE-INIT ON EACH ENTRY ***
-	ASSERT(HERE, lenX && lenY, "illegal 0 dimension!");
-	ASSERT(HERE, (lenY > 1) || (y[0] > 0), "Divide by zero!");
-	ASSERT(HERE, (x && y) && (x != y), "Bad x or y array!");
-	ASSERT(HERE, (q == 0x0 || q != r), "Quotient and remainder arrays must not overlap!");	// q may be 0x0, but must not overlap r
+	ASSERT(lenX && lenY, "illegal 0 dimension!");
+	ASSERT((lenY > 1) || (y[0] > 0), "Divide by zero!");
+	ASSERT((x && y) && (x != y), "Bad x or y array!");
+	ASSERT((q == 0x0 || q != r), "Quotient and remainder arrays must not overlap!");	// q may be 0x0, but must not overlap r
 									// To-do: Change from a simple pointers-coincide to an actual arrays-overlap check.
-	lenD = mi64_getlen(y, lenY);	ASSERT(HERE, lenD != 0, "0-length divisor!");
+	lenD = mi64_getlen(y, lenY);	ASSERT(lenD != 0, "0-length divisor!");
 
 	// Alloc of the repeated-div-associated statics handled separately from other local storage:
 	if(modDdim < lenD) {
@@ -5063,9 +5063,9 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			free((void *)mod_inv_save);	mod_inv_save = 0x0;
 			free((void *)basepow_save);	basepow_save = 0x0;
 		}
-		modulus_save = (uint64 *)calloc((modDdim), sizeof(uint64));	ASSERT(HERE, modulus_save != 0x0, "alloc fail!");
-		mod_inv_save = (uint64 *)calloc((modDdim), sizeof(uint64));	ASSERT(HERE, mod_inv_save != 0x0, "alloc fail!");
-		basepow_save = (uint64 *)calloc((modDdim), sizeof(uint64));	ASSERT(HERE, basepow_save != 0x0, "alloc fail!");
+		modulus_save = (uint64 *)calloc((modDdim), sizeof(uint64));	ASSERT(modulus_save != 0x0, "alloc fail!");
+		mod_inv_save = (uint64 *)calloc((modDdim), sizeof(uint64));	ASSERT(mod_inv_save != 0x0, "alloc fail!");
+		basepow_save = (uint64 *)calloc((modDdim), sizeof(uint64));	ASSERT(basepow_save != 0x0, "alloc fail!");
 		mod_repeat = FALSE;
 	}
 
@@ -5078,10 +5078,10 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 	//	printf("x = %s\n", &str_10k[__convert_mi64_base10_char(str_10k, 10<<10, x, lenX, 0)]);
 		printf("y = %s\n", &s0[convert_mi64_base10_char(s0, y, lenD, 0)]);	// Leave length-check off this so if y too large for print we assert right here
 		// Compute result using slow binary-div algo, use that as reference:
-		qref   = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(HERE, qref   != 0x0, "alloc fail!");
-		rref   = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(HERE, rref   != 0x0, "alloc fail!");
-		lo_dbg = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(HERE, lo_dbg != 0x0, "alloc fail!");
-		hi_dbg = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(HERE, hi_dbg != 0x0, "alloc fail!");
+		qref   = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(qref   != 0x0, "alloc fail!");
+		rref   = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(rref   != 0x0, "alloc fail!");
+		lo_dbg = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(lo_dbg != 0x0, "alloc fail!");
+		hi_dbg = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(hi_dbg != 0x0, "alloc fail!");
 		mi64_set_eq(lo_dbg,x,lenX);
 		mi64_set_eq(hi_dbg,y,lenY);
 		mi64_div_binary(lo_dbg,hi_dbg,lenX,lenY,qref,&lenQ,rref);
@@ -5095,7 +5095,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 		if(vsave) {
 			free((void *)vsave);	vsave = 0x0;
 		}
-		vsave = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(HERE, vsave != 0x0, "alloc fail!");
+		vsave = (uint64 *)calloc((lenX), sizeof(uint64));	ASSERT(vsave != 0x0, "alloc fail!");
 	}
 	if(lenD > lens) {
 		lens = lenD;
@@ -5103,7 +5103,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			free((void *)scratch);	scratch = yinv = cy = tmp = itmp = lo = hi = w = rem_save = 0x0;
 		}
 		/* (re)Allocate the needed auxiliary storage: */
-		scratch = (uint64 *)calloc((lenD*8), sizeof(uint64));	ASSERT(HERE, scratch != 0x0, "alloc fail!");
+		scratch = (uint64 *)calloc((lenD*8), sizeof(uint64));	ASSERT(scratch != 0x0, "alloc fail!");
 	}
 	// These ptrs just point to various disjoint length-lenD sections of the shared local-storage chunk;
 	// since some of them are treated as vars, reset 'em all on each entry, as well as re-zeroing the whole memblock:
@@ -5157,8 +5157,8 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 					nc++; mi64_sub(r,y,r,lenX);	++itmp64;	// Need to incr quotient by 1 to account for extra sub-y
 				}
 			}
-			ASSERT(HERE, nc < ncmax, "Unexpectedly large number of corrections needed for floating-double quotient!");
-			ASSERT(HERE, mi64_cmpult(r, y, lenX), "Remainder should be < modulus!");
+			ASSERT(nc < ncmax, "Unexpectedly large number of corrections needed for floating-double quotient!");
+			ASSERT(mi64_cmpult(r, y, lenX), "Remainder should be < modulus!");
 			// At this point are done with x, so set low word of quotient array and clear rest:
 			if(q) {
 				mi64_clear(q, lenX);	q[0] = itmp64;
@@ -5208,7 +5208,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 	}
 	hi = lo + lenS;	// *** lo:hi pointer pairs must be offset by amount reflecting #words in right-justified modulus! ***
   #if MI64_DIV_MONT
-	if(dbg)printf("mi64_div_mont: setting hi = lo + lenS = %llX\n",(uint64)hi);
+	if(dbg)printf("mi64_div_mont: setting hi = lo + lenS = %" PRIX64 "\n",(uint64)hi);
   #endif
 
 	// If single-word odd-component divisor, use specialized single-word-divisor version:
@@ -5270,10 +5270,10 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			Init yinv = 3*w ^ 2. This formula returns the correct bottom 5 bits of yinv,
 			and we double the number of correct bits on each of the subsequent iterations.
 			*/
-			ASSERT(HERE, (w[0] & (uint64)1) == 1, "modulus must be odd!");
+			ASSERT((w[0] & (uint64)1) == 1, "modulus must be odd!");
 			ybits = lenS << 6;
 			log2_numbits = ceil(log(1.0*ybits)/log(2.0));
-			ASSERT(HERE, (w[0] & (uint64)1) == 1, "w must be odd!");
+			ASSERT((w[0] & (uint64)1) == 1, "w must be odd!");
 			mi64_clear(yinv, lenS);
 			yinv[0] = (w[0] + w[0] + w[0]) ^ (uint64)2;
 
@@ -5301,7 +5301,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			for(j = 6; j < log2_numbits; j++, i <<= 1) {
 				mi64_mul_vector_lo_half(w, yinv,tmp, lenS);
 				mi64_nega              (tmp,tmp, lenS);
-				bw = mi64_add_scalar(tmp, 2ull,tmp, lenS);	ASSERT(HERE, !bw, "");
+				bw = mi64_add_scalar(tmp, 2ull,tmp, lenS);	ASSERT(!bw, "");
 				mi64_mul_vector_lo_half(yinv,tmp, yinv, lenS);
 			}
 			// Save inverse in case next call uses same modulus:
@@ -5310,7 +5310,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 
 		// Check the computed inverse:
 		mi64_mul_vector_lo_half(w, yinv, tmp, lenS);
-		ASSERT(HERE, mi64_cmp_eq_scalar(tmp, 1ull, lenS), "Bad Montmul inverse!");
+		ASSERT(mi64_cmp_eq_scalar(tmp, 1ull, lenS), "Bad Montmul inverse!");
 	#if MI64_DIV_MONT
 		if(dbg)printf("yinv = %s\n", &s0[convert_mi64_base10_char(s0, yinv, lenS, 0)]);
 	#endif
@@ -5337,7 +5337,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 				mi64_set_eq(itmp,tmp,lenS);	// itmp = tmp
 			}
 		#if MI64_DIV_MONT
-			if(dbg)printf("v-cy = %s, bw = %llu\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw);
+			if(dbg)printf("v-cy = %s, bw = %" PRIu64 "\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw);
 		#endif
 
 			// Now do the Montgomery mod: cy = umulh( w, mull(tmp, yinv) );
@@ -5346,7 +5346,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			if(dbg)printf("MULL = %s\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)]);
 		#endif
 			// bw = 0 or 1, but may propagate all the way into high word:
-			ASSERT(HERE, 0ull == mi64_add_scalar(tmp,bw, tmp, lenS), "tmp += bw has carryout!");
+			ASSERT(0ull == mi64_add_scalar(tmp,bw, tmp, lenS), "tmp += bw has carryout!");
 			// Do double-wide product. Fast-divisibility test needs just high half (stored in hi); low half (lo) useful to extract true-mod
 			mi64_mul_vector(tmp,lenS,w,lenS,lo, (uint32*)&j);	// lo:hi = MUL_LOHI(q, tmp)
 
@@ -5359,7 +5359,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			#if MI64_DIV_MONT
 				if(dbg)printf("itmp = %s\n", &s0[convert_mi64_base10_char(s0, itmp, lenS, 0)]);
 			#endif
-				ASSERT(HERE, 0, "Low-half product check mismatch!");
+				ASSERT(0, "Low-half product check mismatch!");
 			}
 		}
 
@@ -5423,7 +5423,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 					// current power p after each halving step here to account for that:
 					p = (p >> 1) + 1;
 				}
-				ASSERT(HERE, j <= 32, "Need 64-bit bitstring!");
+				ASSERT(j <= 32, "Need 64-bit bitstring!");
 				/*
 				Now do the needed powering. We always start with p = 2 and M-square that to get p = 3:
 				*/
@@ -5441,7 +5441,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 					MONT_SQR_N(itmp,lo,w,yinv,tmp,lenS);
 		//	printf("B^5 mod q = %s\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)]);
 				} else {
-					ASSERT(HERE, 0,"Bad starting value for power p!");
+					ASSERT(0,"Bad starting value for power p!");
 				}
 				for(i = j-1; i >= 0; i--) {
 					if(BIT_TEST(n,i)) {
@@ -5494,13 +5494,13 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 				// Now do the Montgomery mod: cy = umulh( y, mull(tmp, yinv) );
 				mi64_mul_vector_lo_half(tmp,yinv,tmp, lenS);	// tmp = tmp*yinv + bw;
 			#if MI64_DIV_MONT
-				if(dbg)printf("tmp*yinv = %s, bw = %llu\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw);
+				if(dbg)printf("tmp*yinv = %s, bw = %" PRIu64 "\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw);
 			#endif
 				// Do double-wide product. Fast-divisibility test needs just high half (stored in hi); low half (lo) useful to extract true-mod
 				mi64_mul_vector(tmp,lenS,w,lenS,lo, (uint32*)&j);	// lo:hi = MUL_LOHI(q, tmp); cy is in hi half
 				// (cy + bw); Since bw = 0 or 1, check that bw=1 does not propagate is (sum >= bw) in 1-limb form.
 				// Apr 2022: in more-general multiword case, check that hi[] + bw does not yield a carryout:
-				itmp64 = mi64_add_scalar(hi,bw, hi, lenS);	ASSERT(HERE, itmp64 == 0ull, "mi64_div_mont(): Unexpected carryout from (hi[] + bw) in quotient loop!");
+				itmp64 = mi64_add_scalar(hi,bw, hi, lenS);	ASSERT(itmp64 == 0ull, "mi64_div_mont(): Unexpected carryout from (hi[] + bw) in quotient loop!");
 			#if MI64_DIV_MONT
 				if(dbg)printf("  lo = %s\n", &s0[convert_mi64_base10_char(s0,   lo, lenS, 0)]);
 				if(dbg)printf("  hi = %s\n", &s0[convert_mi64_base10_char(s0,   hi, lenS, 0)]);
@@ -5509,7 +5509,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 				#if MI64_DIV_MONT
 					printf("itmp = %s\n", &s0[convert_mi64_base10_char(s0, itmp, lenS, 0)]);
 				#endif
-					ASSERT(HERE, 0, "Low-half product check mismatch!");
+					ASSERT(0, "Low-half product check mismatch!");
 				}
 				mi64_set_eq(q+i,tmp,lenS);	// Equivalent to the y[i] = tmp step of the scalar routine
 			}
@@ -5518,11 +5518,11 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 			if(j) {
 				// Check cy = {v[i],v[i+1],...,v[lenX-1],0,...,0}
 				if(!mi64_cmp_eq(hi,v+i,j)) {
-					ASSERT(HERE, mi64_cmp_eq(hi,v+i,j), "cy check!");
+					ASSERT(mi64_cmp_eq(hi,v+i,j), "cy check!");
 				}
 				mi64_clear(q+i,j);	// Do after above check since v may == q
 				for(i = j; i < lenS; i++) {
-					ASSERT(HERE, hi[i] == 0ull, "cy check!");
+					ASSERT(hi[i] == 0ull, "cy check!");
 				}
 			}
 		#if MI64_DIV_MONT
@@ -5558,12 +5558,12 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY,
 		if(!mi64_cmp_eq(rref,r,lenY)) {
 			printf("rref = %s\n", &s0[convert_mi64_base10_char(s0, rref, lenD, 0)]);
 			printf("rewm = %s\n", &s0[convert_mi64_base10_char(s0, r   , lenD, 0)]);
-			ASSERT(HERE, 0, "bzzt!\n");
+			ASSERT(0, "bzzt!\n");
 		}
 		if(!mi64_cmp_eq(qref,q,lenX)) {
 			printf("qref = %s\n", &s0[convert_mi64_base10_char(s0, qref, lenX, 0)]);
 			printf("qewm = %s\n", &s0[convert_mi64_base10_char(s0, q   , lenX, 0)]);
-			ASSERT(HERE, 0, "bzzt!\n");
+			ASSERT(0, "bzzt!\n");
 		}
 
 		free((void *)qref); qref = 0x0;
@@ -5610,12 +5610,12 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 	if(dbg)
 		printf("mi64_div_binary: x = %s, y = %s\n",&s0[convert_mi64_base10_char(s0, x, lenX, 0)],&s1[convert_mi64_base10_char(s1, y, lenY, 0)]);
   #endif
-	ASSERT(HERE, lenX && lenY, "illegal 0 dimension!");
-	ASSERT(HERE, x && y, "At least one of X, Y is null!");
-	ASSERT(HERE, x != y, "X and Y arrays overlap!");
-	ASSERT(HERE, r != y, "Y and Rem arrays overlap!");
-	ASSERT(HERE, q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!");
-	if(q) ASSERT(HERE, lenQ != 0x0, "If quotient requested, quotient-length pointer must be provided!");
+	ASSERT(lenX && lenY, "illegal 0 dimension!");
+	ASSERT(x && y, "At least one of X, Y is null!");
+	ASSERT(x != y, "X and Y arrays overlap!");
+	ASSERT(r != y, "Y and Rem arrays overlap!");
+	ASSERT(q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!");
+	if(q) ASSERT(lenQ != 0x0, "If quotient requested, quotient-length pointer must be provided!");
 	/* Init Q = 0; don't do similarly for R since we allow X and R to point to same array:
 	Jan 2018: No! User may feed qvec only suficient in size to hold ACTUAL QUOTIENT, based on an estimate of the latter -
 	I hit "EXC_BAD_ACCESS, Could not access memory" in a case with xlen = ylen = 2^20, qlen = 1, where I simply fed a
@@ -5629,7 +5629,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 	/* And now find the actual lengths of the divide operands and use those for the computation: */
 	xlen = mi64_getlen(x, lenX);
 	ylen = mi64_getlen(y, lenY);
-	ASSERT(HERE, ylen != 0, "divide by 0!");
+	ASSERT(ylen != 0, "divide by 0!");
 
 	// Allocate the needed auxiliary storage - the 2 yloc = ... / mi64_set_eq calls below copy (lenX + lenY) limbs into scratch, so alloc at least that much:
 	if(lens < (lenX + lenY)) {
@@ -5646,9 +5646,9 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 		Setting said breakpoint is useless, can't see function context when hit. Instead try setting min-size  = 1024 in lens = ... .
 		***/
 	#if 1
-		scratch = (uint64 *)realloc(scratch, lens*sizeof(uint64));	ASSERT(HERE, scratch != 0x0, "alloc fail!");
+		scratch = (uint64 *)realloc(scratch, lens*sizeof(uint64));	ASSERT(scratch != 0x0, "alloc fail!");
 	#else
-		tmp_ptr = (uint64 *)malloc(lens*sizeof(uint64));	ASSERT(HERE, tmp_ptr != 0x0, "alloc fail!");
+		tmp_ptr = (uint64 *)malloc(lens*sizeof(uint64));	ASSERT(tmp_ptr != 0x0, "alloc fail!");
 		free(scratch); scratch = tmp_ptr;
 	#endif
 	}
@@ -5674,7 +5674,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 	lz_x = mi64_leadz(xloc, max_len);
 	lz_y = mi64_leadz(yloc, max_len);
 	nshift = lz_y - lz_x;
-	ASSERT(HERE, nshift >= 0, "nshift < 0");
+	ASSERT(nshift >= 0, "nshift < 0");
 	i = (nshift+63)>>6;
 	if(q) {
 		mi64_clear(q, i);	*lenQ = i;
@@ -5686,9 +5686,9 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 		if(dbg)printf("I = %3d: r = %s, yshift = %s\n", i,&s0[convert_mi64_base10_char(s0, xloc, max_len, 0)],&s1[convert_mi64_base10_char(s1, yloc, max_len, 0)]);
 	#endif
 		if(mi64_cmpuge(xloc, yloc, max_len)) {
-			ASSERT(HERE, xlen == max_len,"xlen != max_len");
+			ASSERT(xlen == max_len,"xlen != max_len");
 			mi64_sub(xloc, yloc, xloc, max_len);	/* r -= yshift */
-			ASSERT(HERE, mi64_cmpult(xloc, yloc, max_len),"r >= yshift");
+			ASSERT(mi64_cmpult(xloc, yloc, max_len),"r >= yshift");
 			xlen = mi64_getlen(xloc, max_len);
 			if(q) {
 				mi64_set_bit(q,i,*lenQ,1);
@@ -5702,7 +5702,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 	}
 	// Remainder in xloc - do some sanity checks prior to copying into r[]:
 	xlen = mi64_getlen(xloc, lenX);
-	ASSERT(HERE, xlen <= ylen && mi64_cmpugt(y,xloc,ylen), "Remainder should be < modulus!");
+	ASSERT(xlen <= ylen && mi64_cmpugt(y,xloc,ylen), "Remainder should be < modulus!");
 	if(r != 0x0) {
 		mi64_set_eq(r, xloc, ylen);
 		if(x == r)	// If x == r, zero the leading (lenX-lenR) limbs of r prior to return:
@@ -5711,7 +5711,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY
 			mi64_clear(r+ylen,lenY-ylen);
 	}
 	/* Final value of yloc is unchanged from its (unshifted) starting value == y */
-	ASSERT(HERE, mi64_cmp_eq(yloc,y,ylen), "Final value of y-copy differs from original!");
+	ASSERT(mi64_cmp_eq(yloc,y,ylen), "Final value of y-copy differs from original!");
   #if MI64_DIV_DBG
 	if(dbg) {
 		if(q)printf("mi64_div_binary: quotient  = %s\n",&s0[convert_mi64_base10_char(s0, q, lenX, 0)]);
@@ -5739,7 +5739,7 @@ int mi64_is_div_by_scalar32(const uint32 x[], uint32 q, uint32 len)
 {
 	uint32 i,j,nshift,dlen,qinv,tmp,cy;
 
-	ASSERT(HERE, q > 0, "mi64_is_div_by_scalar32: 0 modulus!");
+	ASSERT(q > 0, "mi64_is_div_by_scalar32: 0 modulus!");
 	if(q == 1) return TRUE;
 	if(len == 0) return TRUE;
 
@@ -5775,7 +5775,7 @@ int		mi64_is_div_by_scalar32p(const uint32 x[], uint32 q, uint32 qinv, uint32 le
 {
 	uint32 i,dlen,tmp,cy;
 
-	ASSERT(HERE, qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");
+	ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");
 	cy = (uint32)0;
 	dlen = len+len;	/* Since are processing a uint64 array cast to uint32[], double the #words parameter */
 	for(i = 0; i < dlen; ++i) {
@@ -5808,7 +5808,7 @@ int		mi64_is_div_by_scalar32p_x8(
 	uint32 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7,cy0,cy1,cy2,cy3,cy4,cy5,cy6,cy7;
 	cy0 = cy1 = cy2 = cy3 = cy4 = cy5 = cy6 = cy7 = (uint32)0;
 
-	ASSERT(HERE, qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");
+	ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");
 
 	tmp0 = a[0] * qinv;
 	tmp1 = b[0] * qinv;
@@ -5894,7 +5894,7 @@ uint32	mi64_is_div_by_scalar32_x4(const uint32 x[], uint32 q0, uint32 q1, uint32
 	uint32 retval=0,dlen = len+len, qinv0,qinv1,qinv2,qinv3,tmp0,tmp1,tmp2,tmp3,cy0,cy1,cy2,cy3;
 	uint32 xcur,trailx;
 
-	ASSERT(HERE, q0 && q1 && q2 && q3, "mi64_is_div_by_scalar32_x4: 0 modulus!");
+	ASSERT(q0 && q1 && q2 && q3, "mi64_is_div_by_scalar32_x4: 0 modulus!");
 	if(q0 + q1 + q2 + q3 == 4) return TRUE;
 	if(len == 0) return TRUE;
 
@@ -5967,7 +5967,7 @@ uint32	mi64_is_div_by_scalar32_x8(const uint32 x[], uint32 q0, uint32 q1, uint32
 	uint32 retval=0,dlen = len+len, qinv0,qinv1,qinv2,qinv3,qinv4,qinv5,qinv6,qinv7,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7,cy0,cy1,cy2,cy3,cy4,cy5,cy6,cy7;
 	uint32 xcur,trailx;
 
-	ASSERT(HERE, q0 && q1 && q2 && q3 && q4 && q5 && q6 && q7, "mi64_is_div_by_scalar32_x8: 0 modulus!");
+	ASSERT(q0 && q1 && q2 && q3 && q4 && q5 && q6 && q7, "mi64_is_div_by_scalar32_x8: 0 modulus!");
 	if(q0 + q1 + q2 + q3 + q4 + q5 + q6 + q7 == 8) return TRUE;
 	if(len == 0) return TRUE;
 
@@ -6085,10 +6085,10 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 		if(itmp64 > q) {
 			// This check allows us to differentiate between incorrect upward-rounded and (rarer) downward-rounded cases:
 			if(DNINT(fquo) == (double)rem64) {	// Incorrect   upward-rounded, e.g. fquo = 1084809392143.0001, exact = 1084809392142.999...
-			//	printf("%sA: q = %llu < itmp64 = (int64)%lld, fquo = %20.4f, (double)rem64 = %20.4f\n",func,q,(int64)itmp64, fquo, (double)rem64);
+			//	printf("%sA: q = %" PRIu64 " < itmp64 = (int64)%" PRId64 ", fquo = %20.4f, (double)rem64 = %20.4f\n",func,q,(int64)itmp64, fquo, (double)rem64);
 				itmp64 += q;
 			} else {							// Incorrect downward-rounded, e.g. fquo = 7344640876302.9990, exact = 7344640876303.0000002...
-			//	printf("%sB: q = %llu < itmp64 = (int64)%lld, fquo = %20.4f *** Bad Downward ***\n",func,q,(int64)itmp64, fquo);
+			//	printf("%sB: q = %" PRIu64 " < itmp64 = (int64)%" PRId64 ", fquo = %20.4f *** Bad Downward ***\n",func,q,(int64)itmp64, fquo);
 				itmp64 -= q;
 			}
 		}
@@ -6134,7 +6134,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 
 		// Floating-point computation of 2^96 % q not 100% reliable - this pure-int code is our safety net:
 		if(itmp64 > q) {
-			printf("Error correction failed: itmp64 = (int64)%lld, q = %llu [lq(q) = %6.4f]\n",(int64)itmp64,q,log(q)/log(2));
+			printf("Error correction failed: itmp64 = (int64)%" PRId64 ", q = %" PRIu64 " [lq(q) = %6.4f]\n",(int64)itmp64,q,log(q)/log(2));
 			// In such cases re-do using the slower but bulletproof pure-integer method.
 			// Use mod-doublings to get 2^68 (mod q), followed by 3 MONT_SQR64:
 			itmp64 = 0x8000000000000000ull % q;	// 2^63 (mod q)
@@ -6147,7 +6147,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 			MONT_SQR64(itmp64,q,qinv,itmp64);	// 2^(2*68-64) == 2^72 (mod q)
 			MONT_SQR64(itmp64,q,qinv,itmp64);	// 2^(2*72-64) == 2^80 (mod q)
 			MONT_SQR64(itmp64,q,qinv,itmp64);	// 2^(2*80-64) == 2^96 (mod q)
-			ASSERT(HERE, itmp64 < q, "Pure-integer computation of 2^96 mod q fails!");
+			ASSERT(itmp64 < q, "Pure-integer computation of 2^96 mod q fails!");
 		}
 
 	} else if(q >> 32)	{	// q in [2^32,2^48)
@@ -6156,7 +6156,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 		itmp64 = 0x1000000000000000ull % q;	// 2^60 (mod q)
 		MONT_MUL48(itmp64,itmp64,q,qinv,itmp64);	// 2^(2*60-48) == 2^72 (mod q)
 		MONT_MUL48(itmp64,itmp64,q,qinv,itmp64);	// 2^(2*72-48) == 2^96 (mod q)
-		ASSERT(HERE, itmp64 < q, "Pure-integer computation of 2^96 mod q fails!");
+		ASSERT(itmp64 < q, "Pure-integer computation of 2^96 mod q fails!");
 
 	} else {	// q < 2^32
 
@@ -6169,7 +6169,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 		itmp32 -= (-(q32 < itmp32) & q32);	// If 2*itmp32 > q, subtract q
 											// itmp32 = 2^64 (mod q)
 		MONT_MUL32(itmp32,itmp32,q32,qinv32,itmp32);	// 2^(2*64-32) == 2^96 (mod q)
-		ASSERT(HERE, itmp32 < q32, "Pure-integer computation of 2^96 mod q fails!");
+		ASSERT(itmp32 < q32, "Pure-integer computation of 2^96 mod q fails!");
 		itmp64 = itmp32;	// promote to 64-bit
 	}
 
@@ -6177,7 +6177,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 	MONT_SQR64(itmp64,q,qinv,rem64);
 
 #if MI64_RAD_POW64_DBG
-	if(dbg)printf("B^2 mod q = %20llu\n",rem64);
+	if(dbg)printf("B^2 mod q = %20" PRIu64 "\n",rem64);
 #endif
 
 	/* rem64 holds B^2 mod q - Now compute sequence of powers needed to obtain B^len mod q via Montgomery-muls: */
@@ -6200,7 +6200,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 		} else if(p == 5) {
 			MONT_SQR64(itmp64,q,qinv,rem64);
 		} else {
-			ASSERT(HERE, 0,"Bad starting value for power p!");
+			ASSERT(0,"Bad starting value for power p!");
 		}
 		for(i = j-1; i >= 0; i--) {
 			if(BIT_TEST(bmap,i)) {
@@ -6213,7 +6213,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n)
 		}
 	}
 #if MI64_RAD_POW64_DBG
-	if(dbg && p > 2)printf("B^%u mod q = %20llu\n",n,rem64);
+	if(dbg && p > 2)printf("B^%u mod q = %20" PRIu64 "\n",n,rem64);
 #endif
 	return rem64;
 }
@@ -6229,7 +6229,7 @@ int mi64_is_div_by_scalar64(const uint64 x[], uint64 q, uint32 len)
 	uint32 i,nshift;
 	uint64 qinv,cy;
 
-	ASSERT(HERE, q > 0, "mi64_is_div_by_scalar64: 0 modulus!");
+	ASSERT(q > 0, "mi64_is_div_by_scalar64: 0 modulus!");
 	if(q == 1) return TRUE;
 	if(len == 0) return TRUE;
 
@@ -6346,9 +6346,9 @@ int mi64_is_div_by_scalar64_x4(const uint64 x[], uint64 q0, uint64 q1, uint64 q2
 	uint32 nshift0,nshift1,nshift2,nshift3;
 	uint64 qinv0,qinv1,qinv2,qinv3,cy0,cy1,cy2,cy3;
 
-	ASSERT(HERE, (len == 0), "0 length!");
+	ASSERT((len == 0), "0 length!");
 	trailx = trailz64(x[0]);
-	ASSERT(HERE, trailx < 64, "0 low word!");
+	ASSERT(trailx < 64, "0 low word!");
 
 	/* q must be odd for Montgomery-style modmul to work, so first shift off any low 0s: */
 	nshift0 = trailz64(q0);
@@ -6360,8 +6360,8 @@ int mi64_is_div_by_scalar64_x4(const uint64 x[], uint64 q0, uint64 q1, uint64 q2
 	q1 >>= nshift1;
 	q2 >>= nshift2;
 	q3 >>= nshift3;
-	ASSERT(HERE, q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!");
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
+	ASSERT(q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
 
 	qinv0 = (q0+q0+q0) ^ (uint64)2;
 	qinv1 = (q1+q1+q1) ^ (uint64)2;
@@ -6444,7 +6444,7 @@ int mi64_is_div_by_scalar64_x4(const uint64 x[], uint64 q0, uint64 q1, uint64 q2
 #endif
 
 #if MI64_ISDIV_X4_DBG
-	if(dbg)printf("4-way carryouts: cy0-3 = %20llu, %20llu, %20llu, %20llu\n",cy0,cy1,cy2,cy3);
+	if(dbg)printf("4-way carryouts: cy0-3 = %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 "\n",cy0,cy1,cy2,cy3);
 #endif
 	retval += ((cy0 == 0) && (nshift0 <= trailx));
 	retval += ((cy1 == 0) && (nshift1 <= trailx)) << 1;
@@ -6469,13 +6469,13 @@ int mi64_is_div_by_scalar64_u2(const uint64 x[], uint64 q, uint32 len)
 	uint32 i,len2 = (len>>1),nshift;
 	uint64 qinv,cy0,cy1,rpow;
 
-	ASSERT(HERE, q > 0, "mi64_is_div_by_scalar64: 0 modulus!");
+	ASSERT(q > 0, "mi64_is_div_by_scalar64: 0 modulus!");
 	if(q == 1) return TRUE;
 	if(len == 0) return TRUE;
-	ASSERT(HERE, (len&1) == 0, "odd length!");
+	ASSERT((len&1) == 0, "odd length!");
 	/* q must be odd for Montgomery-style modmul to work, so first shift off any low 0s: */
 	nshift = trailz64(q);
-ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!");
+ASSERT(!nshift, "2-way folded ISDIV requires odd q!");
 	if(nshift) {
 		if(trailz64(x[0]) < nshift) return FALSE;
 		q >>= nshift;
@@ -6539,7 +6539,7 @@ ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!");
 #endif
 
 #if MI64_ISDIV_U2_DBG
-	if(dbg)printf("Half-length carryouts: cy0 = %20llu, cy1 = %20llu\n",cy0,cy1);
+	if(dbg)printf("Half-length carryouts: cy0 = %20" PRIu64 ", cy1 = %20" PRIu64 "\n",cy0,cy1);
 #endif
 	// Compute radix-power; add 1 since used high-MUL version of the scaled-remainder algo ( = Algorithm A in the paper)
 	rpow = radix_power64(q,qinv,len2+1);
@@ -6548,8 +6548,8 @@ ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!");
 	MONT_MUL64(cy1,rpow,q,qinv,cy1);	// cy1*B^p (mod q)
 #if MI64_ISDIV_U2_DBG
 	if(dbg) {
-		printf("s1     mod q) = %20llu\n",cy0);
-		printf("s2*B^p mod q) = %20llu\n",cy1);
+		printf("s1     mod q) = %20" PRIu64 "\n",cy0);
+		printf("s2*B^p mod q) = %20" PRIu64 "\n",cy1);
 	}
 #endif
 	// Sum the scaled partial remainders:
@@ -6558,13 +6558,13 @@ ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!");
 	// Negation (mod q) needed for Algo A scaled remainder
 	if(cy0) cy0 = q-cy0 ;
 #if MI64_ISDIV_U2_DBG
-	if(dbg)printf("(s1 + s2*B^p) mod q = %20llu, q = %20llu\n",cy0,q);
+	if(dbg)printf("(s1 + s2*B^p) mod q = %20" PRIu64 ", q = %20" PRIu64 "\n",cy0,q);
 #endif
 	// One more modmul of sum by same power of the base gives true remainder - may as well, since we already have B^p handy:
 	MONT_MUL64(cy0,rpow,q,qinv,cy0);
 #if MI64_ISDIV_U2_DBG
 	if(dbg) {
-		printf("True mod x mod q = %20llu\n",cy0);
+		printf("True mod x mod q = %20" PRIu64 "\n",cy0);
 		exit(0);
 	}
 #endif
@@ -6588,13 +6588,13 @@ int mi64_is_div_by_scalar64_u4(const uint64 x[], uint64 q, uint32 len)
 	uint32 i,len4 = (len>>2),nshift;
 	uint64 qinv,cy0,cy1,cy2,cy3,rpow;
 
-	ASSERT(HERE, q > 0, "mi64_is_div_by_scalar64: 0 modulus!");
+	ASSERT(q > 0, "mi64_is_div_by_scalar64: 0 modulus!");
 	if(q == 1) return TRUE;
 	if(len == 0) return TRUE;
-	ASSERT(HERE, (len&3) == 0, "Length must be a multiple of 4!");
+	ASSERT((len&3) == 0, "Length must be a multiple of 4!");
 	/* q must be odd for Montgomery-style modmul to work, so first shift off any low 0s: */
 	nshift = trailz64(q);
-ASSERT(HERE, !nshift, "4-way folded ISDIV requires odd q!");
+ASSERT(!nshift, "4-way folded ISDIV requires odd q!");
 	if(nshift) {
 		if(trailz64(x[0]) < nshift) return FALSE;
 		q >>= nshift;
@@ -6724,7 +6724,7 @@ ASSERT(HERE, !nshift, "4-way folded ISDIV requires odd q!");
 #endif
 
 #if MI64_ISDIV_U4_DBG
-	if(dbg)printf("Half-length carryouts: cy0-3 = %20llu, %20llu, %20llu, %20llu\n",cy0,cy1,cy2,cy3);
+	if(dbg)printf("Half-length carryouts: cy0-3 = %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 "\n",cy0,cy1,cy2,cy3);
 #endif
 	// Compute radix-power; add 1 since used high-MUL version of the scaled-remainder algo ( = Algorithm A in the paper)
 	rpow = radix_power64(q,qinv,len4+1);
@@ -6736,13 +6736,13 @@ ASSERT(HERE, !nshift, "4-way folded ISDIV requires odd q!");
 	// Negation (mod q) needed for Algo A scaled remainder
 	if(cy0) cy0 = q-cy0 ;
 #if MI64_ISDIV_U4_DBG
-	if(dbg) printf("(sum0-3) mod q = %20llu, q = %20llu\n",cy0,q);
+	if(dbg) printf("(sum0-3) mod q = %20" PRIu64 ", q = %20" PRIu64 "\n",cy0,q);
 #endif
 	// One more modmul of sum by same power of the base gives true remainder:
 	MONT_MUL64(cy0,rpow,q,qinv,cy0);
 #if MI64_ISDIV_U4_DBG
 	if(dbg) {
-		printf("True mod x mod q = %20llu\n",cy0);
+		printf("True mod x mod q = %20" PRIu64 "\n",cy0);
 		exit(0);
 	}
 #endif
@@ -6768,13 +6768,13 @@ uint64 mi64_div_by_scalar64(const uint64 x[], uint64 q, uint32 len, uint64 y[])
 	uint64 qinv,tmp = 0,bw,cy,lo,rem64,rem_save = 0,itmp64,mask,*iptr;
 	double fquo,fqinv;
 /* Debug:
-printf("x[]/q, quotient q = %llu, base b = 2^64\n",q);
+printf("x[]/q, quotient q = %" PRIu64 ", base b = 2^64\n",q);
 for(i = 0; i < len; i++)
-	printf("x[%u] = %20llu;\n",i,x[i]);
+	printf("x[%u] = %20" PRIu64 ";\n",i,x[i]);
 printf("\n");
 */
-	ASSERT(HERE, (x != 0) && (len != 0), "Null input array or length parameter!");
-	ASSERT(HERE, q > 0, "0 modulus!");
+	ASSERT((x != 0) && (len != 0), "Null input array or length parameter!");
+	ASSERT(q > 0, "0 modulus!");
 	// Unit modulus needs special handling to return proper 0 remainder rather than 1:
 	if(q == 1ull) {
 		if(y) mi64_set_eq(y,x,len);
@@ -6802,7 +6802,7 @@ printf("\n");
 		rem_save = x[0] & mask;		// (Which we don`t do since x is read-only; thus we are forced into accounting tricks :)
 		q >>= nshift;
 	}
-	ASSERT(HERE, (q & (uint64)1) == 1, "q must be odd!");
+	ASSERT((q & (uint64)1) == 1, "q must be odd!");
 
 	uint32 q32,qi32;
 	q32  = q; qi32 = minv8[(q&0xff)>>1];
@@ -6814,12 +6814,12 @@ printf("\n");
 	if(dbg) {
 		printf("%s: nshift = %u, Input vector: x = 0;\n",func,nshift,q);
 		if(len > 100) {
-			printf("x[%u] = %20llu, ... x[0] = %20llu\n",len-1,x[len-1],x[0]);	// Pari-debug inputs; For every i++, shift count += 64
+			printf("x[%u] = %20" PRIu64 ", ... x[0] = %20" PRIu64 "\n",len-1,x[len-1],x[0]);	// Pari-debug inputs; For every i++, shift count += 64
 		} else {
-			for(i = 0; i < len; i++) printf("i = %u; x+=%20llu<<(i<<6);\n",i,x[i]);	// Pari-debug inputs; For every i++, shift count += 64
+			for(i = 0; i < len; i++) printf("i = %u; x+=%20" PRIu64 "<<(i<<6);\n",i,x[i]);	// Pari-debug inputs; For every i++, shift count += 64
 			printf("\n");
 		}
-		printf("q = %20llu; qinv = %20llu\n",q,qinv);
+		printf("q = %20" PRIu64 "; qinv = %20" PRIu64 "\n",q,qinv);
 	}
 #endif
 
@@ -6833,7 +6833,7 @@ printf("\n");
 		#if MI64_DIV_MONT64
 			bw = cy;	// Save a copy of the borrow flag for debug-printing
 			itmp64 = tmp + ((-cy)&q);	// Expected value of low-half of MUL_LOHI
-	//		if(dbg)printf("i = %4u, tmp*qinv = %20llu\n",i,tmp*qinv);
+	//		if(dbg)printf("i = %4u, tmp*qinv = %20" PRIu64 "\n",i,tmp*qinv);
 		#endif
 			tmp = tmp*qinv + cy;
 			// Do double-wide product. Fast-divisibility test needs just high half (stored in cy); low half (tmp) needed to extract true-mod
@@ -6843,8 +6843,8 @@ printf("\n");
 			MUL_LOHI64(q, tmp,  tmp, cy);
 		#endif
 		#if MI64_DIV_MONT64
-	//		if(dbg)printf("i = %4u, lo = %20llu, hi = %20llu, bw = %1u\n",i,tmp,cy,(uint32)bw);
-			ASSERT(HERE, itmp64 == tmp, "Low-half product check mismatch!");
+	//		if(dbg)printf("i = %4u, lo = %20" PRIu64 ", hi = %20" PRIu64 ", bw = %1u\n",i,tmp,cy,(uint32)bw);
+			ASSERT(itmp64 == tmp, "Low-half product check mismatch!");
 		#endif
 		}
 	} else {	// Even modulus, with or without quotient computation, uses Algo B
@@ -6871,8 +6871,8 @@ printf("\n");
 			MUL_LOHI64(q, tmp,  tmp, cy);
 		#endif
 		#if MI64_DIV_MONT64
-	//		if(dbg)printf("i = %4u, lo = %20llu, hi = %20llu, bw = %1u\n",i,tmp,cy,(uint32)bw);
-			ASSERT(HERE, *iptr == tmp, "Low-half product check mismatch!");
+	//		if(dbg)printf("i = %4u, lo = %20" PRIu64 ", hi = %20" PRIu64 ", bw = %1u\n",i,tmp,cy,(uint32)bw);
+			ASSERT(*iptr == tmp, "Low-half product check mismatch!");
 		#endif
 		}
 		// Last element has no shift-in from next-higher term, so can compute just the low-half output term, sans explicit MULs:
@@ -6881,7 +6881,7 @@ printf("\n");
 		cy = (cy > *iptr);
 		tmp = tmp + ((-cy)&q);
 	#if MI64_DIV_MONT64
-	//	if(dbg)printf("i = %4u, lo_out = %20llu\n",i,tmp);
+	//	if(dbg)printf("i = %4u, lo_out = %20" PRIu64 "\n",i,tmp);
 	#endif
 	}
 
@@ -6910,7 +6910,7 @@ printf("\n");
 			rem64 = rem64 - q*(uint64)fquo;
 		}
 		if(rem64 != tmp%q) {
-			fprintf(stderr,"WARNING: Bad floating-point mod in mi64_div_by_scalar64! x = %llu, q = %llu: exact remainder = %llu, FP gives %llu.\n",x[0],q,tmp%q,rem64);
+			fprintf(stderr,"WARNING: Bad floating-point mod in mi64_div_by_scalar64! x = %" PRIu64 ", q = %" PRIu64 ": exact remainder = %" PRIu64 ", FP gives %" PRIu64 ".\n",x[0],q,tmp%q,rem64);
 			rem64 = tmp%q;	// Replace FP-approximation result with exact
 		}
 		if(y) {
@@ -6939,7 +6939,7 @@ printf("\n");
 	// current (partial) remainder and re-add the off-shifted part of the true remainder.
 	rem64 = (rem64 << nshift) + rem_save;
 #if MI64_DIV_MONT64
-	if(dbg)printf("True mod: x mod q = %20llu\n",rem64);
+	if(dbg)printf("True mod: x mod q = %20" PRIu64 "\n",rem64);
 #endif
 
 	if(!y)	// Only remainder needed
@@ -6956,7 +6956,7 @@ printf("\n");
 		bw = 0;	cy = rem64;
 		for(i = 0; i < len; ++i) {
 		#if MI64_DIV_MONT64
-	//		if(dbg && i%(len>>2) == 0)printf("bw = %1llu, cy%1u = %20llu\n",bw,i/(len>>2),cy);	// Use to debug loop-folded implemntation
+	//		if(dbg && i%(len>>2) == 0)printf("bw = %1" PRIu64 ", cy%1u = %20" PRIu64 "\n",bw,i/(len>>2),cy);	// Use to debug loop-folded implementation
 		#endif
 			tmp = x[i] - bw - cy;
 			/*  Since may be working in-place, need an extra temp here due to asymmetry of subtract: */
@@ -6971,8 +6971,8 @@ printf("\n");
 			MUL_LOHI64(q, tmp,  lo, cy);
 		#endif
 		#if MI64_DIV_MONT64
-	//		if(dbg)printf("i = %4u, quot[i] = %20llu, lo1 = %20llu, lo2 = %20llu, hi = %20llu, bw = %1u\n",i,tmp,itmp64,lo,cy,(uint32)bw);
-			ASSERT(HERE, itmp64 == lo, "Low-half product check mismatch!");
+	//		if(dbg)printf("i = %4u, quot[i] = %20" PRIu64 ", lo1 = %20" PRIu64 ", lo2 = %20" PRIu64 ", hi = %20" PRIu64 ", bw = %1u\n",i,tmp,itmp64,lo,cy,(uint32)bw);
+			ASSERT(itmp64 == lo, "Low-half product check mismatch!");
 		#endif
 			y[i] = tmp;
 		}
@@ -6990,20 +6990,20 @@ printf("\n");
 			MUL_LOHI64(q, tmp,  lo, cy);
 		#endif
 		#if MI64_DIV_MONT64
-	//		if(dbg)printf("i = %4u, quot[i] = %20llu\n",i,tmp);
+	//		if(dbg)printf("i = %4u, quot[i] = %20" PRIu64 "\n",i,tmp);
 		#endif
 			y[i] = tmp;
 		}
 	}
-	ASSERT(HERE, bw == 0 && cy == 0, "bw/cy check!");
+	ASSERT(bw == 0 && cy == 0, "bw/cy check!");
 #if MI64_DIV_MONT64
 	if(dbg) {
-		printf("len = %u, q = %llu, nshift = %u, rem = %llu\n",len,q,nshift,rem64);
+		printf("len = %u, q = %" PRIu64 ", nshift = %u, rem = %" PRIu64 "\n",len,q,nshift,rem64);
 		if(len > 100) {
-			printf("Quotient y[%u] = %20llu, y[%u] = %20llu, ... y[0] = %20llu\n",len-1,y[len-1],len-2,y[len-2],y[0]);	// Pari-debug inputs; For every i++, shift count += 64
+			printf("Quotient y[%u] = %20" PRIu64 ", y[%u] = %20" PRIu64 ", ... y[0] = %20" PRIu64 "\n",len-1,y[len-1],len-2,y[len-2],y[0]);	// Pari-debug inputs; For every i++, shift count += 64
 		} else {
 			printf("Quotient y = 0;\n");
-			for(i = 0; i < len; i++) printf("i = %u; y+=%20llu<<(i<<6);\n",i,y[i]);	// Pari-debug inputs; For every i++, shift count += 64
+			for(i = 0; i < len; i++) printf("i = %u; y+=%20" PRIu64 "<<(i<<6);\n",i,y[i]);	// Pari-debug inputs; For every i++, shift count += 64
 			printf("\n");
 		}
 	}
@@ -7026,8 +7026,8 @@ uint64 mi64_div_by_scalar64_u2(uint64 x[], uint64 q, uint32 lenu, uint64 y[])	//
 #endif
 	int i,j,npad = (lenu&1),len = lenu + npad,len2 = (len>>1),nshift,lshift = -1;	// Pad to even length
 	uint64 qinv,cy0,cy1,rpow,rem_save = 0,xsave,itmp64,mask,*iptr0,*iptr1,ptr_incr;
-	ASSERT(HERE, (x != 0) && (len != 0), "Null input array or length parameter!");
-	ASSERT(HERE, q > 0, "0 modulus!");
+	ASSERT((x != 0) && (len != 0), "Null input array or length parameter!");
+	ASSERT(q > 0, "0 modulus!");
 	// Unit modulus needs special handling to return proper 0 remainder rather than 1:
 	if(q == 1ull) {
 		if(y) mi64_set_eq(y,x,len);
@@ -7204,7 +7204,7 @@ See similar behavior for 4-way-split version of the algorithm.
 #endif
 
 #if MI64_DIV_MONT64_U2
-	if(dbg)printf("Half-length carryouts: cy0 = %20llu, cy1 = %20llu\n",cy0,cy1);
+	if(dbg)printf("Half-length carryouts: cy0 = %20" PRIu64 ", cy1 = %20" PRIu64 "\n",cy0,cy1);
 #endif
 
 	if(!nshift) {	// Odd modulus uses Algo A
@@ -7224,7 +7224,7 @@ See similar behavior for 4-way-split version of the algorithm.
 	MONT_MUL64(cy0,rpow,q,qinv,cy0);
 
 #if MI64_DIV_MONT64_U2
-	if(dbg) printf("True mod %c = %20llu\n",'A'+(nshift != 0),cy0);
+	if(dbg) printf("True mod %c = %20" PRIu64 "\n",'A'+(nshift != 0),cy0);
 #endif
 
 	// If we applied an initial right-justify shift to the modulus, restore the shift to the
@@ -7255,7 +7255,7 @@ See similar behavior for 4-way-split version of the algorithm.
 		MULH64(q,tmp0, cy0);			MULH64(q,tmp1, cy1);
 	#endif
 	#if MI64_DIV_MONT64_U2
-		if(dbg)printf("quot[%2u] = %20llu, quot[%2u] = %20llu, bw0,1 = %1u,%1u, cy0,1 = %20llu,%20llu\n",i,tmp0,i+len2,tmp1,(uint32)bw0,(uint32)bw1,cy0,cy1);
+		if(dbg)printf("quot[%2u] = %20" PRIu64 ", quot[%2u] = %20" PRIu64 ", bw0,1 = %1u,%1u, cy0,1 = %20" PRIu64 ",%20" PRIu64 "\n",i,tmp0,i+len2,tmp1,(uint32)bw0,(uint32)bw1,cy0,cy1);
 	#endif
 		// Write quotient word(s):
 		y[i] = tmp0;					y[i+len2] = tmp1;
@@ -7339,7 +7339,7 @@ See similar behavior for 4-way-split version of the algorithm.
 
 #endif
 
-	ASSERT(HERE, cy1 == 0, "cy check!");	// all but the uppermost carryout are generally nonzero
+	ASSERT(cy1 == 0, "cy check!");	// all but the uppermost carryout are generally nonzero
 	x[lenu] = xsave;	// Restore input value of zero-padding one-beyond element x[lenu] prior to return
 	return rpow;
 }
@@ -7375,15 +7375,15 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[])
 	static uint64 *svec = 0x0;	// svec = "scratch vector"
 	if(first_entry) {
 		first_entry = FALSE;
-		svec = (uint64 *)calloc(len_save, sizeof(uint64));	ASSERT(HERE, svec != 0x0, "alloc failed!");
+		svec = (uint64 *)calloc(len_save, sizeof(uint64));	ASSERT(svec != 0x0, "alloc failed!");
 	}
 	if(len > len_save) {
 		len_save = len<<1;
-		svec = (uint64 *)realloc(svec, len_save*sizeof(uint64));	ASSERT(HERE, svec != 0x0, "alloc failed!");
+		svec = (uint64 *)realloc(svec, len_save*sizeof(uint64));	ASSERT(svec != 0x0, "alloc failed!");
 	}
 
-	ASSERT(HERE, (x != 0) && (len != 0), "Null input array or length parameter!");
-	ASSERT(HERE, q > 0, "0 modulus!");
+	ASSERT((x != 0) && (len != 0), "Null input array or length parameter!");
+	ASSERT(q > 0, "0 modulus!");
 	// Unit modulus needs special handling to return proper 0 remainder rather than 1:
 	if(q == 1ull) {
 		if(y) mi64_set_eq(y,x,len);
@@ -7798,7 +7798,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[])
 #endif
 
 #if MI64_DIV_MONT64_U4
-	if(dbg)printf("Half-length carryouts: cy0-3 = %20llu, %20llu, %20llu, %20llu\n",cy0,cy1,cy2,cy3);
+	if(dbg)printf("Half-length carryouts: cy0-3 = %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 "\n",cy0,cy1,cy2,cy3);
 #endif
 
 #ifdef USE_AVX2
@@ -7830,7 +7830,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[])
 	MONT_MUL64(cy0,rpow,q,qinv,cy0);
 
 #if MI64_DIV_MONT64_U4
-	if(dbg) printf("True mod %c = %20llu\n",'A'+(nshift != 0),cy0);
+	if(dbg) printf("True mod %c = %20" PRIu64 "\n",'A'+(nshift != 0),cy0);
 #endif
 
 	// If we applied an initial right-justify shift to the modulus, restore the shift to the
@@ -7862,7 +7862,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[])
 		MULH64(q,tmp0, cy0);		MULH64(q,tmp1, cy1);		MULH64(q,tmp2, cy2);		MULH64(q,tmp3, cy3);
 	#endif
 	#if MI64_DIV_MONT64_U4
-		if(dbg)printf("quot[%2u,%2u,%2u,%2u] = %20llu,%20llu,%20llu,%20llu, bw0-3 = %1u,%1u,%1u,%1u, cy0-3 = %20llu,%20llu,%20llu,%20llu\n",i0,i1,i2,i3,tmp0,tmp1,tmp2,tmp3,(uint32)bw0,(uint32)bw1,(uint32)bw2,(uint32)bw3,cy0,cy1,cy2,cy3);
+		if(dbg)printf("quot[%2u,%2u,%2u,%2u] = %20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 ", bw0-3 = %1u,%1u,%1u,%1u, cy0-3 = %20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 "\n",i0,i1,i2,i3,tmp0,tmp1,tmp2,tmp3,(uint32)bw0,(uint32)bw1,(uint32)bw2,(uint32)bw3,cy0,cy1,cy2,cy3);
 	#endif
 		// Write quotient words:
 		y[i] = tmp0;				y[i+len4] = tmp1;			y[i+len2] = tmp2;			y[i+len2+len4] = tmp3;
@@ -7992,7 +7992,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[])
   #endif	// AVX2/MULX or not?
 
 #endif
-	ASSERT(HERE, cy3 == 0, "cy check!");	// all but the uppermost carryout are generally nonzero
+	ASSERT(cy3 == 0, "cy check!");	// all but the uppermost carryout are generally nonzero
 	// Restore input values of 0-pad elements prior to return:
 	for(i = 0; i < npad; i++) {
 		x[lenu+i] = pads[i];
@@ -8053,7 +8053,7 @@ uint32 mi64_div_y32(uint64 x[], uint32 y, uint64 q[], uint32 len)
 		rem = tsum%y;
 	}
 	if(rem == 0 && x != q) {	// If overwrote input with quotient in above loop, skip this
-		ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)x, y, len), "Results of mi64_div_y32 and mi64_is_div_by_scalar32 differ!");
+		ASSERT(mi64_is_div_by_scalar32((uint32 *)x, y, len), "Results of mi64_div_y32 and mi64_is_div_by_scalar32 differ!");
 		return 0;
 	}
 	return (uint32)rem;
@@ -8083,7 +8083,7 @@ int	__convert_mi64_base10_char(char char_buf[], uint32 n_alloc_chars, const uint
 	double dtmp = 0.0;
 	static uint64 *temp = 0x0;
 	static uint32 tlen = 0;	// #64-bit slots in current memalloc for *temp
-	ASSERT(HERE, fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!");	// Make sure these scaling powers have been inited
+	ASSERT(fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!");	// Make sure these scaling powers have been inited
 
 	/* Estimate # of decimal digits: */
 	curr_len = mi64_getlen(x, len);	/* this checks that len > 0; need at least one digit, even if it = 0. curr_len guaranteed > 0. */
@@ -8092,7 +8092,7 @@ int	__convert_mi64_base10_char(char char_buf[], uint32 n_alloc_chars, const uint
 		if(temp) {
 			free((void *)temp);	temp = 0x0;
 		}
-		temp = (uint64 *)calloc(curr_len, sizeof(uint64));	ASSERT(HERE, temp != 0x0, "alloc failed!");
+		temp = (uint64 *)calloc(curr_len, sizeof(uint64));	ASSERT(temp != 0x0, "alloc failed!");
 		tlen = curr_len;
 	}
 	mi64_set_eq(temp, x, curr_len);
@@ -8100,7 +8100,7 @@ int	__convert_mi64_base10_char(char char_buf[], uint32 n_alloc_chars, const uint
 	if(curr_len > 1) dtmp = x[curr_len-2]*TWO64FLINV;
 	MAX_DIGITS = ceil( (curr_len-1)*log10_base + log((double)x[curr_len-1] + dtmp)/ln10 );
 	MAX_DIGITS = MAX(MAX_DIGITS, 1);
-	ASSERT(HERE, MAX_DIGITS < n_alloc_chars, "Output string overflows buffer");
+	ASSERT(MAX_DIGITS < n_alloc_chars, "Output string overflows buffer");
 	if(wrap_every) {
 		MAX_DIGITS += MAX_DIGITS/wrap_every;
 	}
@@ -8151,7 +8151,7 @@ int	__convert_mi64_base10_char_print_lead0(char char_buf[], uint32 n_alloc_chars
 	double dtmp = 0.0;
 	static uint64 *temp = 0x0;
 	static uint32 tlen = 0;	// #64-bit slots in current memalloc for *temp
-	ASSERT(HERE, fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!");	// Make sure these scaling powers have been inited
+	ASSERT(fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!");	// Make sure these scaling powers have been inited
 
 	/* Estimate # of decimal digits: */
 	curr_len = mi64_getlen(x, len);	/* this checks that len > 0; need at least one digit, even if it = 0. curr_len guaranteed > 0. */
@@ -8160,7 +8160,7 @@ int	__convert_mi64_base10_char_print_lead0(char char_buf[], uint32 n_alloc_chars
 		if(temp) {
 			free((void *)temp);	temp = 0x0;
 		}
-		temp = (uint64 *)calloc(curr_len, sizeof(uint64));	ASSERT(HERE, temp != 0x0, "alloc failed!");
+		temp = (uint64 *)calloc(curr_len, sizeof(uint64));	ASSERT(temp != 0x0, "alloc failed!");
 		tlen = curr_len;
 	}
 	mi64_set_eq(temp, x, curr_len);
@@ -8168,11 +8168,11 @@ int	__convert_mi64_base10_char_print_lead0(char char_buf[], uint32 n_alloc_chars
 	if(curr_len > 1) dtmp = x[curr_len-2]*TWO64FLINV;
 	MAX_DIGITS = ceil( (curr_len-1)*log10_base + log((double)x[curr_len-1] + dtmp)/ln10 );
 	if(MAX_DIGITS > ndigit) {
-		ASSERT(HERE, 0, "ERROR: MAX_DIGITS > ndigit!");
+		ASSERT(0, "ERROR: MAX_DIGITS > ndigit!");
 	} else {
 		MAX_DIGITS = ndigit;
 	}
-	ASSERT(HERE, MAX_DIGITS < n_alloc_chars, "Output string overflows buffer");
+	ASSERT(MAX_DIGITS < n_alloc_chars, "Output string overflows buffer");
 	if(wrap_every) {
 		MAX_DIGITS += MAX_DIGITS/wrap_every;
 	}
@@ -8243,7 +8243,7 @@ uint64 *convert_base10_char_mi64(const char*char_buf, uint32 *len)
 		LEN_MAX = (uint32)ceil( (imax-i)/log10_base );
 	}
 	// 01/09/2009: Add an extra zero-pad element here as workaround for bug in mi64_div called with differing-length operands:
-	mi64_vec = (uint64 *)calloc(LEN_MAX+1, sizeof(uint64));	ASSERT(HERE, mi64_vec != 0x0, "alloc failed!");
+	mi64_vec = (uint64 *)calloc(LEN_MAX+1, sizeof(uint64));	ASSERT(mi64_vec != 0x0, "alloc failed!");
 	imin = i;
 	for(i = imin; i < imax; i++) {
 		c = char_buf[i];
@@ -8251,19 +8251,19 @@ uint64 *convert_base10_char_mi64(const char*char_buf, uint32 *len)
 			free((void *)mi64_vec);	*len = 0;	return 0x0;
 		}
 		curr_digit = (uint64)(c - CHAROFFSET);
-		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
+		ASSERT(curr_digit < 10,"util.c: curr_digit < 10");
 		/* currsum *= 10, and check for overflow: */
 		tmp = mi64_mul_scalar(mi64_vec, (uint64)10, mi64_vec, *len);
 		if(tmp != 0) {
 			if(*len == LEN_MAX) {
 				printf("ERROR: Mul-by-10 overflows in convert_base10_char_mi64: Offending input string = %s\n", char_buf);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 			mi64_vec[(*len)++] = tmp;
 		}
 
 		*len += mi64_add_scalar(mi64_vec, curr_digit, mi64_vec, *len);
-		ASSERT(HERE, *len <= LEN_MAX,"len <= LEN_MAX");
+		ASSERT(*len <= LEN_MAX,"len <= LEN_MAX");
 	}
 	*len = LEN_MAX;	/* Nominal length, so user knows how much memory was allocated */
 	return mi64_vec;
@@ -8296,8 +8296,8 @@ and returns 1 if 2^(-p) == -1 (mod q) (which also means 2^p == -1), 0 otherwise.
 #endif
 uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], uint32 len, uint64*res)
 {
-	ASSERT(HERE, p != 0x0, "Null p-array pointer!");
-	ASSERT(HERE, q != 0x0, "Null q-array pointer!");
+	ASSERT(p != 0x0, "Null p-array pointer!");
+	ASSERT(q != 0x0, "Null q-array pointer!");
 	uint32 pow2, FERMAT = mi64_isPow2(p,len,&pow2)<<1;	// *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case
   #if MI64_POW_DBG
 	uint32 dbg = FERMAT && pow2 == 256;//STREQ(&s0[convert_mi64_base10_char(s0, q, len, 0)], "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127");
@@ -8311,7 +8311,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 	uint64 lead_chunk, lo64, cyout;
 	uint32 lenP, lenQ, qbits, log2_numbits, start_index, zshift;
   #if MI64_POW_DBG
-	if(dbg) printf("mi64_twopmodq: F%u with k = %llu\n",pow2,k);
+	if(dbg) printf("mi64_twopmodq: F%u with k = %" PRIu64 "\n",pow2,k);
   #endif
 	if(first_entry) {
 		first_entry = FALSE;
@@ -8321,8 +8321,8 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 		x      = (uint64 *)calloc((lenq_save  ), sizeof(uint64));
 		lo     = (uint64 *)calloc((2*lenq_save), sizeof(uint64));
 	}
-	lenP = mi64_getlen(p, len_p);	ASSERT(HERE, lenP > 0, "0 exponent");
-	lenQ = mi64_getlen(q, len);		ASSERT(HERE, lenQ > 0, "0 modulus!");
+	lenP = mi64_getlen(p, len_p);	ASSERT(lenP > 0, "0 exponent");
+	lenQ = mi64_getlen(q, len);		ASSERT(lenQ > 0, "0 modulus!");
 	if(len_p > lenp_save) {
 		lenp_save = len_p;
 		pshift = (uint64 *)realloc(pshift, (len_p+1)*sizeof(uint64));
@@ -8335,21 +8335,21 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 		x      = (uint64 *)realloc(x     , (lenQ   )*sizeof(uint64));
 		lo     = (uint64 *)realloc(lo    , (2*lenQ )*sizeof(uint64));
 	}
-	ASSERT(HERE, pshift != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0, "alloc failed!");
+	ASSERT(pshift != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0, "alloc failed!");
 	hi = lo + lenQ;	// Pointer to high half of double-wide product
 
   #if MI64_POW_DBG
-	if(dbg) printf("mi64_twopmodq: k = %llu, len = %u, lenQ = %u\n",k,len,lenQ);
+	if(dbg) printf("mi64_twopmodq: k = %" PRIu64 ", len = %u, lenQ = %u\n",k,len,lenQ);
   #endif
 	qbits = lenQ << 6;
 	mi64_shrl_short(q, qhalf, 1, lenQ);	/* (q >> 1) = (q-1)/2, since q odd. */
 
 	/* pshift = p + len*64 */
 	pshift[lenP] = mi64_add_scalar(p, lenQ*64, pshift, lenP);	// April 2015: lenP ==> lenQ here!
-	ASSERT(HERE, !pshift[lenP], "pshift overflows!");
+	ASSERT(!pshift[lenP], "pshift overflows!");
 
   #if MI64_POW_DBG
-	if(dbg) printf("Init: k = %llu, lenP = %u, lenQ = %u\n",k,lenP,lenQ);
+	if(dbg) printf("Init: k = %" PRIu64 ", lenP = %u, lenQ = %u\n",k,lenP,lenQ);
   #endif
 	log2_numbits = ceil(log(1.0*qbits)/log(2.0));
 	/*
@@ -8369,7 +8369,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 	*/
 	/* Extract leftmost log2_numbits bits of pshift (if >= qbits, use the leftmost log2_numbits-1) and subtract from qbits: */
 	pbits = mi64_extract_lead64(pshift,len_p,&lo64);
-	ASSERT(HERE, pbits >= log2_numbits, "leadz64!");
+	ASSERT(pbits >= log2_numbits, "leadz64!");
 //	if(pbits >= 64)
 		lead_chunk = lo64>>(64-log2_numbits);
 //	else
@@ -8378,12 +8378,12 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 	if(lead_chunk >= qbits) {
 		lead_chunk >>= 1;
 	#if MI64_POW_DBG
-		if(dbg) printf("lead%u = %llu\n", log2_numbits-1,lead_chunk);
+		if(dbg) printf("lead%u = %" PRIu64 "\n", log2_numbits-1,lead_chunk);
 	#endif
 		start_index = pbits-(log2_numbits-1);	/* Use only the leftmost log2_numbits-1 bits */
 	} else {
 	#if MI64_POW_DBG
-		if(dbg) printf("lead%u = %llu\n", log2_numbits  ,lead_chunk);
+		if(dbg) printf("lead%u = %" PRIu64 "\n", log2_numbits  ,lead_chunk);
 	#endif
 		start_index = pbits-log2_numbits;
 	}
@@ -8405,7 +8405,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 	Init qinv = q. This formula returns the correct bottom 5 bits of qinv,
 	and we double the number of correct bits on each of the subsequent iterations.
 	*/
-	ASSERT(HERE, (q[0] & (uint64)1) == 1, "q must be odd!");
+	ASSERT((q[0] & (uint64)1) == 1, "q must be odd!");
 	mi64_clear(qinv, lenQ);
 
 	/* Newton iteration involves repeated steps of form
@@ -8429,7 +8429,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 	}
 	// Check the computed inverse:
 	mi64_mul_vector_lo_half(q, qinv, x, lenQ);
-	ASSERT(HERE, mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!");
+	ASSERT(mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!");
   #if MI64_POW_DBG
 	if(dbg) {
 		printf("q    = %s\n", &cbuf[convert_mi64_base10_char(cbuf, q   , lenQ, 0)]);
@@ -8455,14 +8455,14 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 	if(dbg) printf("q*lo/2^%u = %s\n", (lenQ<<6), &cbuf[convert_mi64_base10_char(cbuf, lo, lenQ, 0)]);
   #endif
 	/* hi = 0 in this instance, which simplifies things. */
-	cyout = mi64_sub(q, lo, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+	cyout = mi64_sub(q, lo, x, lenQ);	ASSERT(cyout == 0ull, "");
 	if(mi64_test_bit(pshift, j)) {
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(mi64_cmpugt(x, qhalf, lenQ)) {
 			cyout = mi64_add(x, x, x, lenQ);
 			cyout = mi64_sub(x, q, x, lenQ);
 		} else {
-			cyout = mi64_add(x, x, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+			cyout = mi64_add(x, x, x, lenQ);	ASSERT(cyout == 0ull, "");
 		}
 	}
   #if MI64_POW_DBG
@@ -8494,7 +8494,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 			if(!mi64_cmp_eq(lo,x,lenQ)) {
 				printf("lo = MULH_QFERM = %s\n", &cbuf[convert_mi64_base10_char(cbuf,lo, lenQ, 0)] );
 				printf("lo = MULH       = %s\n", &cbuf[convert_mi64_base10_char(cbuf, x, lenQ, 0)] );
-				printf("Mismatch! pow2 = %u, k = %llu\n",pow2,k);
+				printf("Mismatch! pow2 = %u, k = %" PRIu64 "\n",pow2,k);
 				exit(0);
 			}
 		}
@@ -8514,14 +8514,14 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 			cyout = mi64_sub(q, lo, lo, lenQ);
 			cyout = mi64_add(lo, hi, x, lenQ);
 		} else {
-			cyout = mi64_sub(hi, lo, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+			cyout = mi64_sub(hi, lo, x, lenQ);	ASSERT(cyout == 0ull, "");
 		}
 
 		if(mi64_test_bit(pshift, j)) {
 		#if MI64_POW_DBG
 			if(dbg) printf("2x...\n");
 		#endif
-			ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q");
+			ASSERT(mi64_cmpult(x, q, lenQ), "x >= q");
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(mi64_cmpugt(x, qhalf, lenQ)) {
 			#if MI64_POW_DBG
@@ -8530,7 +8530,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[],
 				cyout = mi64_add(x, x, x, lenQ);
 				cyout = mi64_sub(x, q, x, lenQ);
 			} else {
-				cyout = mi64_add(x, x, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+				cyout = mi64_add(x, x, x, lenQ);	ASSERT(cyout == 0ull, "");
 			}
 		}
 		#if MI64_POW_DBG
@@ -8576,7 +8576,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 	static uint32  first_entry = TRUE;
 
 	// Quick computation of number of uint64 needed to hold current q:
-	ASSERT(HERE, (k != 0) && ((k2>>1) == k), "2*k overflows!");	// Make sure 2*k does not overflow
+	ASSERT((k != 0) && ((k2>>1) == k), "2*k overflows!");	// Make sure 2*k does not overflow
 	j = (p+1)&63;	// p+1 mod 64, needed since q = 2*k*MMp+1 ~= k*MM(p+1)
 	lenP = ((p+1) + 63)>>6;	// #64-bit words needed
 	lo64 = k;		// Copy of k
@@ -8586,7 +8586,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 		lenQ = lenP;
 	}
   #if MI64_POW_DBG
-	if(dbg) { printf("mi64_twopmodq_qmmp: k = %llu, lenP = %u, lenQ = %u\n",k,lenP,lenQ); }
+	if(dbg) { printf("mi64_twopmodq_qmmp: k = %" PRIu64 ", lenP = %u, lenQ = %u\n",k,lenP,lenQ); }
   #endif
 
 	if(first_entry || (p != psave) || (lenQ != lenQ_save))
@@ -8594,15 +8594,15 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 		first_entry = FALSE;
 		psave = p;
 		free((void *)pshift);
-		pshift = (uint64 *)calloc((lenP+1), sizeof(uint64));	ASSERT(HERE, pshift != 0x0, "calloc of pshift[] failed!");
+		pshift = (uint64 *)calloc((lenP+1), sizeof(uint64));	ASSERT(pshift != 0x0, "calloc of pshift[] failed!");
 		pshift[0] = 1;
 		mi64_shl(pshift, pshift, p, lenP);	// 2^p
 		mi64_sub_scalar(pshift, 1, pshift, lenP);	// M(p) = 2^p-1
 		/* pshift = p + len*64: */
 		pshift[lenP] = mi64_add_scalar(pshift, lenP*64, pshift, lenP);
-		ASSERT(HERE, !pshift[lenP], "pshift overflows!");
+		ASSERT(!pshift[lenP], "pshift overflows!");
 	#if MI64_POW_DBG
-		if(dbg) { printf("mi64_twopmodq_qmmp: Init: k = %llu, lenP = %u, lenQ = %u\n",k,lenP,lenQ); }
+		if(dbg) { printf("mi64_twopmodq_qmmp: Init: k = %" PRIu64 ", lenP = %u, lenQ = %u\n",k,lenP,lenQ); }
 	#endif
 		lenQ_save = lenQ;
 		free((void *)q    );
@@ -8616,7 +8616,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 		x      = (uint64 *)calloc((lenQ), sizeof(uint64));
 		lo   = (uint64 *)calloc((2*lenQ), sizeof(uint64));
 		hi   = lo + lenQ;	/* Pointer to high half of double-wide product */
-		ASSERT(HERE, q != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0 && hi != 0x0, "alloc failed!");
+		ASSERT(q != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0 && hi != 0x0, "alloc failed!");
 		qbits = lenQ << 6;
 		log2_numbits = ceil(log(1.0*qbits)/log(2.0));
 
@@ -8637,7 +8637,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 		*/
 	/* Extract leftmost log2_numbits bits of pshift (if >= qbits, use the leftmost log2_numbits-1) and subtract from qbits: */
 		pbits = mi64_extract_lead64(pshift,lenP,&lo64);
-		ASSERT(HERE, pbits >= log2_numbits, "leadz64!");
+		ASSERT(pbits >= log2_numbits, "leadz64!");
 	//	if(pbits >= 64)
 			lead_chunk = lo64>>(64-log2_numbits);
 	//	else
@@ -8646,12 +8646,12 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 		if(lead_chunk >= qbits) {
 			lead_chunk >>= 1;
 		#if MI64_POW_DBG
-			if(dbg) { printf("lead%u = %llu\n", log2_numbits-1,lead_chunk); }
+			if(dbg) { printf("lead%u = %" PRIu64 "\n", log2_numbits-1,lead_chunk); }
 		#endif
 			start_index = pbits-(log2_numbits-1);	/* Use only the leftmost log2_numbits-1 bits */
 		} else {
 		#if MI64_POW_DBG
-			if(dbg) { printf("lead%u = %llu\n", log2_numbits  ,lead_chunk); }
+			if(dbg) { printf("lead%u = %" PRIu64 "\n", log2_numbits  ,lead_chunk); }
 		#endif
 			start_index = pbits-log2_numbits;
 		}
@@ -8670,8 +8670,8 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 	q[0] = 1; mi64_shl(q, q, p, lenQ);
 	mi64_sub_scalar(q, 1, q, lenQ);	// M(p) = 2^p-1
 	cyout = mi64_mul_scalar(q, k2, q, lenQ);
-	ASSERT(HERE, !cyout, "2.k.M(p) overflows!");	// 2.k.M(p)
-	ASSERT(HERE, 0 != q[lenQ-1], "Excessive word size allocated for q!");
+	ASSERT(!cyout, "2.k.M(p) overflows!");	// 2.k.M(p)
+	ASSERT(0 != q[lenQ-1], "Excessive word size allocated for q!");
 	mi64_add_scalar(q, 1ull, q, lenQ);	// q = 2.k.M(p) + 1
 	mi64_shrl_short(q, qhalf, 1, lenQ);	/* (q >> 1) = (q-1)/2, since q odd. */
   #else
@@ -8679,8 +8679,8 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 	j = p>>6;	// p/64; the set-bit in 2^p goes into the (j)th word of q[]
 	q[j] = ( 1ull << (p-(j<<6)) );
 	mi64_sub_scalar(q, 1, q, lenQ);	// M(p) = 2^p-1
-	cyout = mi64_mul_scalar(q, k2, q, lenQ);	ASSERT(HERE, !cyout, "2.k.M(p) overflows!");	// 2.k.M(p)
-	ASSERT(HERE, 0 != q[lenQ-1], "Excessive word size allocated for q!");
+	cyout = mi64_mul_scalar(q, k2, q, lenQ);	ASSERT(!cyout, "2.k.M(p) overflows!");	// 2.k.M(p)
+	ASSERT(0 != q[lenQ-1], "Excessive word size allocated for q!");
 	mi64_add_scalar(q, 1ull, q, lenQ);	// q = 2.k.M(p) + 1
 	mi64_shrl_short_short(q,qhalf, 1, lenQ);	// qhalf = (q >> 1) = (q-1)/2, since q odd.
   #endif
@@ -8688,7 +8688,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 	Find modular inverse (mod 2^qbits) of q in preparation for modular multiply.
 	q must be odd for Montgomery-style modmul to work.
 	*/
-	ASSERT(HERE, (q[0] & (uint64)1) == 1, "q must be odd!");
+	ASSERT((q[0] & (uint64)1) == 1, "q must be odd!");
 	mi64_clear(qinv, lenQ);
 
 	/* Newton iteration involves repeated steps of form
@@ -8712,7 +8712,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 	}
 	// Check the computed inverse:
 	mi64_mul_vector_lo_half(q, qinv, x, lenQ);
-	ASSERT(HERE, mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!");
+	ASSERT(mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!");
   #if MI64_POW_DBG
 	if(dbg) {
 		printf("q    = %s\n", &cbuf[convert_mi64_base10_char(cbuf, q   , lenQ, 0)]);
@@ -8738,17 +8738,17 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
   #endif
 
 	/* hi = 0 in this instance, which simplifies things. */
-	cyout = mi64_sub(q, lo, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+	cyout = mi64_sub(q, lo, x, lenQ);	ASSERT(cyout == 0ull, "");
 
 	// mi64_test_bit(pshift, j) always true for this portion of MMp powering
-	ASSERT(HERE, mi64_test_bit(pshift, j), "pshift bit = 0 for pre-loop step!");
-	ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q");
+	ASSERT(mi64_test_bit(pshift, j), "pshift bit = 0 for pre-loop step!");
+	ASSERT(mi64_cmpult(x, q, lenQ), "x >= q");
 	/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 	if(mi64_cmpugt(x, qhalf, lenQ)) {
 		cyout = mi64_add(x, x, x, lenQ);
 		cyout = mi64_sub(x, q, x, lenQ);
 	} else {
-		cyout = mi64_add(x, x, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+		cyout = mi64_add(x, x, x, lenQ);	ASSERT(cyout == 0ull, "");
 	}
 
   #if MI64_POW_DBG
@@ -8782,21 +8782,21 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 			cyout = mi64_sub(q, lo, lo, lenQ);
 			cyout = mi64_add(lo, hi, x, lenQ);
 		} else {
-			cyout = mi64_sub(hi, lo, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+			cyout = mi64_sub(hi, lo, x, lenQ);	ASSERT(cyout == 0ull, "");
 		}
 
 	#if MI64_POW_DBG
 		if(dbg) { printf("x = %s\n",&cbuf[convert_mi64_base10_char(cbuf, x, lenQ, 0)]); }
 	#endif
 		// mi64_test_bit(pshift, j) always true for this portion of MMp powering
-		ASSERT(HERE, mi64_test_bit(pshift, j), "pshift bit = 0!");
+		ASSERT(mi64_test_bit(pshift, j), "pshift bit = 0!");
 	#if MI64_POW_DBG
 		if(!mi64_cmpult(x, q, lenQ)) {
-			printf("x < q test failed for k = %llu, j = %u!\n",k,j);
+			printf("x < q test failed for k = %" PRIu64 ", j = %u!\n",k,j);
 		}
 		if(dbg) { printf("2x...\n"); }
 	#else
-		ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q");
+		ASSERT(mi64_cmpult(x, q, lenQ), "x >= q");
 	#endif
 
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
@@ -8804,7 +8804,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 			cyout = mi64_add(x, x, x, lenQ);
 			cyout = mi64_sub(x, q, x, lenQ);
 		} else {
-			cyout = mi64_add(x, x, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+			cyout = mi64_add(x, x, x, lenQ);	ASSERT(cyout == 0ull, "");
 		}
 	}
 	for(; j >= 0; j--)
@@ -8818,17 +8818,17 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32
 			cyout = mi64_sub(q, lo, lo, lenQ);
 			cyout = mi64_add(lo, hi, x, lenQ);
 		} else {
-			cyout = mi64_sub(hi, lo, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+			cyout = mi64_sub(hi, lo, x, lenQ);	ASSERT(cyout == 0ull, "");
 		}
 
 		if((pshift[0] >> j) & (uint64)1) {
-			ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q");
+			ASSERT(mi64_cmpult(x, q, lenQ), "x >= q");
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(mi64_cmpugt(x, qhalf, lenQ)) {
 				cyout = mi64_add(x, x, x, lenQ);
 				cyout = mi64_sub(x, q, x, lenQ);
 			} else {
-				cyout = mi64_add(x, x, x, lenQ);	ASSERT(HERE, cyout == 0ull, "");
+				cyout = mi64_add(x, x, x, lenQ);	ASSERT(cyout == 0ull, "");
 			}
 		}
 	}
diff --git a/src/pairFFT_mul.c b/src/pairFFT_mul.c
index 4b98958a..03890546 100755
--- a/src/pairFFT_mul.c
+++ b/src/pairFFT_mul.c
@@ -122,7 +122,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 	static int init_sse2 = FALSE;
 	int thr_id = -1;	// No multithread support yet.
 
-	ASSERT(HERE, ((uint32)FFT_MUL_BASE >> 16) == 1, "FFT_MUL_BASE != 2^16");
+	ASSERT(((uint32)FFT_MUL_BASE >> 16) == 1, "FFT_MUL_BASE != 2^16");
 
 	/***
 	Having a separate init block for the big index array allows us to init this prior
@@ -131,7 +131,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 	if(INIT_ARRAYS)
 	{
 		/* In init mode, x-input array used for temporary storage: */
-		ASSERT(HERE, x != 0x0, "if INIT_ARRAYS = TRUE, x-input array must be non-null!");
+		ASSERT(x != 0x0, "if INIT_ARRAYS = TRUE, x-input array must be non-null!");
 
 		/* Reset this on an INIT_ARRAYS call to ensure that the
 		radix_set != radix_set_save code below gets executed in that case: */
@@ -145,7 +145,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		n2inv = 1.0/(N2);
 
 		/* Only power-of-2 FFT lengths supported for now: */
-		ASSERT(HERE, (n>>trailz32(n)) == 1,"Only power-of-2 FFT lengths supported!");
+		ASSERT((n>>trailz32(n)) == 1,"Only power-of-2 FFT lengths supported!");
 
 		// Use get_fft_radices' zero-index radix set (guaranteed to be available if the FFT length is supported)
 		// to find how many different radsets available at this length, then loop over them (including the 0-one)
@@ -157,13 +157,13 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 			retval = get_fft_radices(n>>10, radix_set, &NRADICES, RADIX_VEC, 10);
 			if(retval == ERR_FFTLENGTH_ILLEGAL) {
 				sprintf(char_str, "ERROR: length %d = %d K not available.\n", n, n>>10);
-				ASSERT(HERE, 0, char_str);
+				ASSERT(0, char_str);
 			} else if(retval == ERR_RADIXSET_UNAVAILABLE) {
 				sprintf(char_str, "ERROR: radix set %10d not available.\n",radix_set);
-				ASSERT(HERE, 0, char_str);
+				ASSERT(0, char_str);
 			} else if(retval != 0) {
 				sprintf(char_str, "ERROR: unknown return value %d from get_fft_radix; N = %d, kblocks = %u, radset = %u.\n", retval, n, kblocks, radix_set);
-				ASSERT(HERE, 0, char_str);
+				ASSERT(0, char_str);
 			}
 			// Make sure n/radix_vec0 >= 1024:
 			if(n/RADIX_VEC[0] < 1024)
@@ -171,7 +171,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 			if( (RADIX_VEC[NRADICES-1] == 16) && (RADIX_VEC[0] == 8 || RADIX_VEC[0] == 16 || RADIX_VEC[0] == 32) )
 				break;
 		}
-		ASSERT(HERE, radix_set < nradsets, "Unable to find suitable radix set!");
+		ASSERT(radix_set < nradsets, "Unable to find suitable radix set!");
 		radix_vec0 = RADIX_VEC[0];
 		radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)radix_vec0));
 		nchunks = radix_vec0>>1;
@@ -179,24 +179,24 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		/* My array padding scheme requires N/radix_vec0 to be a power of 2, and to be >= 2^DAT_BITS, where the latter
 		parameter is set in the Mdata.h file: */
 		if(n%radix_vec0 != 0) {
-			ASSERT(HERE, 0, "ERROR: RADIX_VEC[0] does not divide N!\n");
+			ASSERT(0, "ERROR: RADIX_VEC[0] does not divide N!\n");
 		}
 
 		/* Make sure n/radix_vec0 is a power of 2: */
 		i = n/radix_vec0;
 		if((i >> trailz32(i)) != 1) {
-			ASSERT(HERE, 0, "ERROR: n/RADIX_VEC[0] not a power of 2!\n");
+			ASSERT(0, "ERROR: n/RADIX_VEC[0] not a power of 2!\n");
 		}
 
 		/*...Set the array padding parameters - only use array padding elements for runlengths > 32K. */
 		if(DAT_BITS < 31) {
 			/*...If array padding turned on, check that the blocklength divides the unpadded runlength...	*/
-			ASSERT(HERE, ((n >> DAT_BITS) << DAT_BITS) == n,"ERROR: blocklength does not divide runlength!");
+			ASSERT(((n >> DAT_BITS) << DAT_BITS) == n,"ERROR: blocklength does not divide runlength!");
 
 			/* Now make sure n/RADIX_VEC[0] is sufficiently large (unless n < 2^DAT_BITS, in which case it doesn't matter): */
 			if(i < (1 << DAT_BITS)) {
 				sprintf(char_str, "ERROR: n/RADIX_VEC[0] must be >= %u!\n", (1 << DAT_BITS));
-				ASSERT(HERE, 0, char_str);
+				ASSERT(0, char_str);
 			}
 		}
 
@@ -217,11 +217,11 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		}
 
 		if(mm*RADIX_VEC[NRADICES-1] != N2) {
-			ASSERT(HERE, 0, "product of radices not equal to complex vector length\n");
+			ASSERT(0, "product of radices not equal to complex vector length\n");
 		}
 
 /*		index = (int *)calloc(k,sizeof(int));	*/
-		index_ptmp = ALLOC_INT(index_ptmp, k);	if(!index_ptmp){ ASSERT(HERE, 0, "unable to allocate array INDEX in pairFFT_mul.\n"); }
+		index_ptmp = ALLOC_INT(index_ptmp, k);	if(!index_ptmp){ ASSERT(0, "unable to allocate array INDEX in pairFFT_mul.\n"); }
 		index      = ALIGN_INT(index_ptmp);
 
 		/*...Forward (DIF) FFT sincos data are in bit-reversed order. We define a separate last-pass twiddles
@@ -242,7 +242,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 			radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; break;
 		default :
 			sprintf(char_str, "radix[0] = %d not available.\n",RADIX_VEC[i]);
-			ASSERT(HERE, 0, char_str);
+			ASSERT(0, char_str);
 		}
 
 		for(i=1; i < NRADICES; i++)
@@ -279,7 +279,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 				radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; break;
 			default :
 				sprintf(char_str, "radix %d not available. Halting...\n",RADIX_VEC[i]);
-				ASSERT(HERE, 0, char_str);
+				ASSERT(0, char_str);
 			}
 		}
 		nradices_prim = l;
@@ -297,7 +297,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 */
 		  default :
 			sprintf(char_str, "ERROR: radix %d not available for _pairFFT dyadic-mul step.\n",RADIX_VEC[NRADICES-1]);
-			ASSERT(HERE, 0, char_str);
+			ASSERT(0, char_str);
 		}
 
 		return;
@@ -307,15 +307,15 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		/* If FORWARD_FFT_ONLY = TRUE, at least the X-ptr should be valid: */
 		n_inputs = 1;
 		if((uint32)FORWARD_FFT_ONLY > 2) {
-			ASSERT(HERE, 0, "FORWARD_FFT_ONLY not a any-nonzero-denotes-TRUE param: legal TRUE-values are 1 and 2!");
+			ASSERT(0, "FORWARD_FFT_ONLY not a any-nonzero-denotes-TRUE param: legal TRUE-values are 1 and 2!");
 		} else if(FORWARD_FFT_ONLY == 1) {
-			ASSERT(HERE, x != 0x0 && z == 0x0, "FORWARD_FFT_ONLY requires X-input nonzero and Z-input null!");
+			ASSERT(x != 0x0 && z == 0x0, "FORWARD_FFT_ONLY requires X-input nonzero and Z-input null!");
 			/* One or two inputs to be processed? */
 			ivec[0] = x;
 			ivec[1] = y;
 			n_inputs += (y != 0x0);
 		} else {	// FORWARD_FFT_ONLY = 0 and 2 behave similarly
-			ASSERT(HERE, x != 0x0 && y != 0x0, "FORWARD_FFT_ONLY = FALSE requires Non-null X,Y-inputs!");
+			ASSERT(x != 0x0 && y != 0x0, "FORWARD_FFT_ONLY = FALSE requires Non-null X,Y-inputs!");
 			/* One input to be processed: */
 			ivec[0] = x;
 			ab_mul = y; cd_mul = z;
@@ -343,14 +343,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		for(i = 0; i < NRADICES; i++) {
 			if(RADIX_VEC[i] == 0) {
 				sprintf(cbuf, "RADIX_VEC[i = %d] zero, for i < [NRADICES = %d]!",i,NRADICES);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			radix_set_save[i] = RADIX_VEC[i];
 		}
 		for(i = NRADICES; i < 10; i++) {
 			if(RADIX_VEC[i] != 0) {
 				sprintf(cbuf, "RADIX_VEC[i = %d] nonzero, for i >= [NRADICES = %d]!",i,NRADICES);
-				ASSERT(HERE, 0, cbuf);
+				ASSERT(0, cbuf);
 			}
 			radix_set_save[i] = 0;
 		}
@@ -360,7 +360,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		if(n%radix_vec0 != 0) {
 			sprintf(cbuf  ,"RADIX_VEC[0] does not divide N!\n");
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		/* Make sure n/RADIX_VEC[0] is a power of 2: */
@@ -368,7 +368,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		if((i >> trailz32(i)) != 1) {
 			sprintf(cbuf  ,"n/RADIX_VEC[0] not a power of 2!\n");
 			fprintf(stderr,"%s", cbuf);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		if(DAT_BITS < 31) {
@@ -376,14 +376,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 			if(i < (1 << DAT_BITS)) {
 				sprintf(cbuf  ,"vn/RADIX_VEC[0] must be >= %u!\n", (1 << DAT_BITS));
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 
 			/* We also have a lower limit on 2^DAT_BITS set by the pairFFT_mul routine: */
 			if((1 << DAT_BITS) < 2*RADIX_VEC[NRADICES-1]) {
 				sprintf(cbuf  ,"final FFT radix may not exceed = %u!\n", (1 << (DAT_BITS-1)));
 				fprintf(stderr,"%s", cbuf);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 		}
 
@@ -415,14 +415,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		NRT_BITS = (uint32)(log(sqrt(1.0*n))/log(2.0) + 0.5);	NRT = 1 << NRT_BITS;	NRTM1 = NRT - 1;
 		if(n%NRT) {
 			sprintf(cbuf,"ERROR: NRT does not divide N!\n");
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 
 		/*...The rt0 array stores the (0:NRT-1)th powers of the [N2]th root of unity
 		(i.e. will be accessed using the lower lg(NRT) bits of the integer sincos index):
 		*/
 		rt0_ptmp = ALLOC_COMPLEX(rt0_ptmp, NRT);
-		if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		rt0 = ALIGN_COMPLEX(rt0_ptmp);
 
 		qt     = i64_to_q((int64)N2);
@@ -443,7 +443,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		(and will be accessed using the upper bits, <NRT:31>, of the integer sincos index):
 		*/
 		rt1_ptmp = ALLOC_COMPLEX(rt1_ptmp, n/(2*NRT));
-		if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		rt1 = ALIGN_COMPLEX(rt1_ptmp);
 
 		qn     = i64_to_q((int64)NRT);
@@ -468,7 +468,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		/* 8/23/2004: Need to allocate an extra element here to account for the padding element that gets inserted when radix_vec0 is odd: */
 
 		block_index = (int *)calloc((radix_vec0+1),sizeof(int));
-		if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		/*
 		Examples - We only allow powers of 2 here, for the more general case cf. mers_mod_square.c:
 
@@ -522,7 +522,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 				// Do two loop executions:
 				for(j = 0; j < 2; j++)
 				{
-					if(!(l >= 0 && l < radix_vec0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+					if(!(l >= 0 && l < radix_vec0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 					block_index[ii] = l;	//fprintf(stderr,"%3d %3d\n",ii,l);
 					ii++;	// every time we execute this innermost loop (which corresponds to one
 							// block of FFT data being processed), increment the linear array index
@@ -543,14 +543,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		}		/* End of Main loop */
 
 		/* arrays storing the index values needed for the parallel-block wrapper/square scheme: */
-		if( !(ws_i            = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_j1           = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_j2           = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_j2_start     = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_k            = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_m            = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_blocklen     = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		if( !(ws_blocklen_sum = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if( !(ws_i            = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_j1           = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_j2           = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2           in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_j2_start     = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_k            = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_m            = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M            in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_blocklen     = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN     in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		if( !(ws_blocklen_sum = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 		/*...Final DIF pass, wrapper/squaring and initial DIT pass are all done in-place.
 			 This combines data from both the l1 and l2-block, except in the case ii = 0
@@ -583,7 +583,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 					*/
 					default :
 						sprintf(cbuf,"ERROR: Final radix %d not available for %s. Halting...\n",RADIX_VEC[NRADICES-1],func);
-						fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+						fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 				}
 			}
 		}
@@ -614,7 +614,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 			radix32_dif_pass1(a,n); break;
 		default :
 			sprintf(cbuf,"ERROR: radix %d not available for dif_pass1. Halting...\n",radix_vec0);
-			ASSERT(HERE, 0,cbuf);
+			ASSERT(0,cbuf);
 		}
 	  }
 		/* Break the remaining portion of the FFT into radix0 blocks, and in each pass of the resulting loop
@@ -648,10 +648,10 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 				case 32 :
 					ierr = radix32_ditN_cy_dif1      (a,n,  0,       0,0x0,0x0,0x0,0x0,0x0,0x0,     0x0,   0,&fracmax,0); break;
 				default :
-					sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix_vec0); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf);
+					sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix_vec0); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf);
 			}
 			/* Nonzero remaining carries are instantly fatal: */
-			ASSERT(HERE, ierr == 0, "pairFFT_mul: Fatal: carry routine return error!");
+			ASSERT(ierr == 0, "pairFFT_mul: Fatal: carry routine return error!");
 
 		/*...Now do the fractional error check. Any fractional part  > 0.40625 generates a warning...	*/
 		// Dec 2014: Bump threshold up from ( >= 0.4 ) to ( > 0.40625 ):
@@ -678,7 +678,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 		radix32_dit_pass1(a,n);	break;
 	  default :
 		sprintf(char_str, "radix %d not available for final IFFT pass!\n",radix_vec0);
-		ASSERT(HERE, 0, char_str);
+		ASSERT(0, char_str);
 	}
 
 	/*...And re-NINT the 'undo pass' data, which may differ from pure-int by some tiny amount: */
@@ -703,7 +703,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int
 	{
 		fprintf(stderr,"%s: max_fp > 0.01! Value = %20.10f\n",func,max_fp);
 		fprintf(stderr,"Check your build for inadvertent mixing of SSE2 and non-SSE2-enabled files!\n");
-		ASSERT(HERE, max_fp < 0.01,"max_fp < 0.01");
+		ASSERT(max_fp < 0.01,"max_fp < 0.01");
 	}
 
 	// Restore input value of MODULUS_TYPE:
@@ -754,7 +754,7 @@ if(FORWARD_FFT_ONLY != 2)	// Cf. comments in pairFFT_mul about this
 				radix32_dif_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break;
 			default :
 				sprintf(cbuf,"pairFFT_mul_process_chunk: ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 
 			k    += mm*radix_vec0;
@@ -783,7 +783,7 @@ if(FORWARD_FFT_ONLY != 2)	// Cf. comments in pairFFT_mul about this
 	*/
 		  default :
 			sprintf(char_str, "pairFFT_mul_process_chunk: ERROR: radix %d not available for dyadic mul step.\n",RADIX_VEC[NRADICES-1]);
-			ASSERT(HERE, 0, char_str);
+			ASSERT(0, char_str);
 		}
 	}
 	/* In forward-FFT-only mode, do none of the IFFT passes: */
@@ -797,7 +797,7 @@ if(FORWARD_FFT_ONLY != 2)	// Cf. comments in pairFFT_mul about this
 	{
 		/* Get block index of the chunk of contiguous data to be processed: */
 		l = block_index[ii + j];
-		ASSERT(HERE, l >= 0,"pair_FFTmul_process_chunk: l >= 0");
+		ASSERT(l >= 0,"pair_FFTmul_process_chunk: l >= 0");
 
 		/* Quick-n-dirty way of generating the correct starting values of k, mm and incr -
 		simply use the skeleton of the forward (DIF) loop, sans the i = NRADICES-2 pass
@@ -837,7 +837,7 @@ if(FORWARD_FFT_ONLY != 2)	// Cf. comments in pairFFT_mul about this
 				radix32_dit_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break;
 			default :
 				sprintf(cbuf,"pairFFT_mul_process_chunk: ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]);
-				ASSERT(HERE, 0,cbuf);
+				ASSERT(0,cbuf);
 			}
 		}	/* end i-loop */
 	}	/* end j-loop */
diff --git a/src/pm1.c b/src/pm1.c
index 017aee59..8fcf1d0b 100755
--- a/src/pm1.c
+++ b/src/pm1.c
@@ -43,7 +43,7 @@ Then to run, e.g.
 	uint32 PM1_S2_NBUF = 0;	// # of floating-double residue-length memblocks available for Stage 2
 	uint32 B1 = 0;
 	uint64 B2 = 0ull, B2_start = 0ull;
-	char cbuf[STR_MAX_LEN];
+	char cbuf[STR_MAX_LEN*2];
 	uint32 SYSTEM_RAM, MAX_RAM_USE;	// Total usable main memory size, and max. amount of that to use per instance, in MB
 	double MME;
 #else
@@ -202,9 +202,9 @@ uint32 pm1_set_bounds(const uint64 p, const uint32 n, const uint32 tf_bits, cons
 	// Force B1 >= 10^4 to avoid possible large-buffer-count underflow of qlo in stage 2.
 	// Conservatively use (#bits in Stage 1 prime-powers product ~= 1.5*B1), must fit into a uint32, thus B1_max = 2^33/3 = 2863311530:
 	i64 = p>>7;
-	ASSERT(HERE, i64 <= 2863311530ull, "Stage 1 prime-powers product must fit into a uint32; default B1 for your exponent is too large!");
+	ASSERT(i64 <= 2863311530ull, "Stage 1 prime-powers product must fit into a uint32; default B1 for your exponent is too large!");
 	B1 = MAX((uint32)i64,10000);	// #bits in Stage 1 prime-powers product ~= 1.4*B1, so e.g. B1 = p/128 gives a ~= 1.1*p/100 bits
-	B1 = (B1 + 99999)*inv100k;	B1 *= 100000;	ASSERT(HERE, B1 >= 100000, "B1 unacceptably small!");	// Round up to nearest 100k:
+	B1 = (B1 + 99999)*inv100k;	B1 *= 100000;	ASSERT(B1 >= 100000, "B1 unacceptably small!");	// Round up to nearest 100k:
 	if(PM1_S2_NBUF < 24) {
 		sprintf(cbuf,"pm1_set_bounds: Insufficient free memory for Stage 2 ... will run only Stage 1.\n");
 		mlucas_fprint(cbuf,pm1_standlone+1);
@@ -259,15 +259,15 @@ uint32 pm1_set_bounds(const uint64 p, const uint32 n, const uint32 tf_bits, cons
 		pm1_bigstep_size(&PM1_S2_NBUF, &bigstep, &stage2_mem_multiple,psmall);
 		if(bigstep != 210 && bigstep != 330 && bigstep != 420 && bigstep != 660 && bigstep != 840) {
 			sprintf(cbuf,"%u is unsupported value of bigstep!",bigstep);
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 		}
 		double f2 = 30.0/(1 - 0.93*log10(log10(0.37037037037037037037*(double)PM1_S2_NBUF)));
 		B2 = (uint64)(f2*(double)B1);
 		// Round to nearest 1m:
-		B2 = (B2 + 999999)*inv1m;	B2 *= 1000000;	ASSERT(HERE, B2 >= 1000000, "B2 unacceptably small!");
+		B2 = (B2 + 999999)*inv1m;	B2 *= 1000000;	ASSERT(B2 >= 1000000, "B2 unacceptably small!");
 	}
 	pm1_check_bounds();	// This sanity-checks the bounds and sets B2_start = B1 if unset.
-	sprintf(cbuf,"Setting default p-1 stage bounds b1 = %u, b2_start = %llu, b2 = %llu.\n",B1,B2_start,B2);
+	sprintf(cbuf,"Setting default p-1 stage bounds b1 = %u, b2_start = %" PRIu64 ", b2 = %" PRIu64 ".\n",B1,B2_start,B2);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 	return 1;
 }
@@ -283,17 +283,17 @@ uint32 pm1_check_bounds() {
 		if(B1 < 10000) { sprintf(cbuf,"The minimum P-1 Stage 1 bound = 10000; resetting to that.\n"); mlucas_fprint(cbuf,pm1_standlone+1); B1 = 10000; }
 	#endif
 		if(B2_start) {
-			if(B2_start > B2) { sprintf(cbuf,"P-1 Stage 2 starting bound [= %llu] must be less than or equal to Stage 2 bound [= %llu].\n",B2_start,B2); break; }
-			if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %llu] must be greater than or equal to that of Stage 1 [= %u].\n",B2,B1); break; }
+			if(B2_start > B2) { sprintf(cbuf,"P-1 Stage 2 starting bound [= %" PRIu64 "] must be less than or equal to Stage 2 bound [= %" PRIu64 "].\n",B2_start,B2); break; }
+			if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %" PRIu64 "] must be greater than or equal to that of Stage 1 [= %u].\n",B2,B1); break; }
 		} else if(B2) {	// Stage 2 takes off where Stage 1 left off
-			if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %llu] set nonzero but < Stage 1 bound [= %u] ... no Stage 2 will be run.\n",B2,B1); }
+			if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %" PRIu64 "] set nonzero but < Stage 1 bound [= %u] ... no Stage 2 will be run.\n",B2,B1); }
 			B2_start = B1;
 		} else {	// No Stage 2 - Can set both of these to 0 or B1 in this case
 			B2_start = B2 = (uint64)0;
 		}
 		return 1;	// B1 and B2 legal.
 	}
-	mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+	mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 	return 0;	// Bzzt!
 }
 
@@ -333,7 +333,7 @@ global would be needed to store that - and remultiply by the appropriate one for
 */
 uint32 compute_pm1_s1_product(const uint64 p) {
 	const double A = 1.1;
-	ASSERT(HERE, B1 > 0, "Call to compute_pm1_s1_product needs Stage 1 bound global B1 to be set!");
+	ASSERT(B1 > 0, "Call to compute_pm1_s1_product needs Stage 1 bound global B1 to be set!");
 	double ln = log(B1), lg = ln*ILG2;
 	uint32 i,len = 0,nmul,nbits,ebits = (uint32)((lg-A)*B1/(ln-A));
 	uint64 iseed,maxmult;
@@ -344,7 +344,7 @@ uint32 compute_pm1_s1_product(const uint64 p) {
 	PM1_S1_PRODUCT = ALLOC_UINT64(PM1_S1_PRODUCT, s1p_alloc);
 	if(!PM1_S1_PRODUCT ){
 		sprintf(cbuf, "ERROR: unable to allocate array PM1_S1_PRODUCT with %u linbs in main.\n",s1p_alloc);
-		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 	}
 
 	// (E.g. on restart) First see if a savefile holding the precomputed/bit-reversed product for this p and B1 exists:
@@ -359,22 +359,22 @@ uint32 compute_pm1_s1_product(const uint64 p) {
   #endif
 		// For M(p) want to seed the S1 prime-powers product with 2*p; for F(m) we want seed = 2^(m+2). Since in the latter
 		// case our input p contains 2^m, can handle both cases via iseed = 4*p, giving an extra *2 in the Mersenne case:
-		iseed = p<<2;	ASSERT(HERE, (iseed>>2) == p,"Binary exponent overflows (uint64)4*p in compute_pm1_s1_product!");
+		iseed = p<<2;	ASSERT((iseed>>2) == p,"Binary exponent overflows (uint64)4*p in compute_pm1_s1_product!");
 		len = pm1_s1_ppow_prod(iseed, B1, PM1_S1_PRODUCT, &nmul, &maxmult);	PM1_S1_PROD_B1 = B1;
 		nbits = (len<<6)-mi64_leadz(PM1_S1_PRODUCT,len);
 		if(len > s1p_alloc) {
 			sprintf(cbuf,"Size of S1 prime-powers product exceeds alloc of PM1_S1_PRODUCT[]!");
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 		}
 	/*
 		fprintf(stderr,"Product of Stage 1 prime powers used %u mi64_mul_scalar() calls; max-multiplier %u bits\n",nmul, 64-leadz64(maxmult));
 		fprintf(stderr,"Limbs of PM1_S1_PRODUCT, low to high:\n");
 		for(i = 0; i < len; i+=8) {
-			fprintf(stderr,"%llx,%llx,%llx,%llx,%llx,%llx,%llx,%llx\n",PM1_S1_PRODUCT[i],PM1_S1_PRODUCT[i+1],PM1_S1_PRODUCT[i+2],PM1_S1_PRODUCT[i+3],PM1_S1_PRODUCT[i+4],PM1_S1_PRODUCT[i+5],PM1_S1_PRODUCT[i+6],PM1_S1_PRODUCT[i+7]);
+			fprintf(stderr,"%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 "\n",PM1_S1_PRODUCT[i],PM1_S1_PRODUCT[i+1],PM1_S1_PRODUCT[i+2],PM1_S1_PRODUCT[i+3],PM1_S1_PRODUCT[i+4],PM1_S1_PRODUCT[i+5],PM1_S1_PRODUCT[i+6],PM1_S1_PRODUCT[i+7]);
 		}
 		exit(0);
 	*/
-	//	fprintf(stderr,"PM1_S1_PRODUCT limbs[%u,%u,...,1,0] = %016llX,%016llX,...,%016llX,%016llX\n",len-1,len-2,PM1_S1_PRODUCT[len-1],PM1_S1_PRODUCT[len-2],PM1_S1_PRODUCT[1],PM1_S1_PRODUCT[0]);
+	//	fprintf(stderr,"PM1_S1_PRODUCT limbs[%u,%u,...,1,0] = %016" PRIX64 ",%016" PRIX64 ",...,%016" PRIX64 ",%016" PRIX64 "\n",len-1,len-2,PM1_S1_PRODUCT[len-1],PM1_S1_PRODUCT[len-2],PM1_S1_PRODUCT[1],PM1_S1_PRODUCT[0]);
 		// Ignore the #iters != 0 user needed to set to invoke selfTest mode, replace with nbits in S1 prime-powers product:
 		PM1_S1_PROD_BITS = nbits-1;	// Leftmost bit accounted for by setting initial seed in the LR-modular binary powering
 		// Bit-reverse s1 product, leaving leftmost 1-bit off. REMEMBER, this puts the 0-bits corresponding to the
@@ -393,7 +393,7 @@ uint32 compute_pm1_s1_product(const uint64 p) {
   #ifndef PM1_STANDALONE
 		// Write result to savefile:
 		if(!write_pm1_s1_prod(savefile, p, PM1_S1_PROD_BITS, PM1_S1_PRODUCT, PM1_S1_PROD_RES64)) {
-			snprintf_nowarn(cbuf,STR_MAX_LEN,"WARN: Unable to write precomputed/bit-reversed Stage 1 prime-powers product to savefile %s.\n",savefile);
+			snprintf(cbuf,STR_MAX_LEN*2,"WARN: Unable to write precomputed/bit-reversed Stage 1 prime-powers product to savefile %s.\n",savefile);
 			mlucas_fprint(cbuf,pm1_standlone+1);
 		}
 	} 	// endif(read_pm1_s1_prod)
@@ -401,7 +401,7 @@ uint32 compute_pm1_s1_product(const uint64 p) {
 	sprintf(cbuf,"Product of Stage 1 prime powers with b1 = %u is %u bits (%u limbs), vs estimated %u. Setting PRP_BASE = 3.\n",B1,PM1_S1_PROD_BITS+1,len,ebits);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 	PRP_BASE = 3;
-	sprintf(cbuf,"BRed (PM1_S1_PRODUCT sans leading bit) has %u limbs, Res64 = %llu\n",len,PM1_S1_PROD_RES64);
+	sprintf(cbuf,"BRed (PM1_S1_PRODUCT sans leading bit) has %u limbs, Res64 = %" PRIu64 "\n",len,PM1_S1_PROD_RES64);
 	mlucas_fprint(cbuf,pm1_standlone+0);
 	return len;	// return actual #limbs of product, not initial overestimate
 }
@@ -412,8 +412,8 @@ uint32 pm1_s1_ppow_prod(const uint64 iseed, const uint32 b1, uint64 accum[], uin
 	uint32 p = 2,i,j,len,maxbits = 64-leadz64(b1);
 	uint32 loop = 64/maxbits;	// Number of prime-powers we can accumulate inside inner loop while remaining < 2^64
 	uint64 tmp,prod,mult,cy = 0ull;
-	ASSERT(HERE, accum != 0x0, "Null accum[] pointer in s1_ppow_prod()");
-	ASSERT(HERE, accum != 0x0, "Zero initial seed in s1_ppow_prod()");
+	ASSERT(accum != 0x0, "Null accum[] pointer in s1_ppow_prod()");
+	ASSERT(iseed != 0ull, "Zero initial seed in s1_ppow_prod()");	// Check the seed value itself; the accum pointer is already validated on the preceding line
 	accum[0] = iseed; len = 1; *nmul = 0; *maxmult = 0ull;
 // Debug-only - allows testing of S1 on known-factor case without actually running S2:
 #if 0
@@ -422,10 +422,10 @@ uint32 pm1_s1_ppow_prod(const uint64 iseed, const uint32 b1, uint64 accum[], uin
 	mult = 140091319777ull;
 	cy = mi64_mul_scalar(accum, mult, accum, len);	++*nmul;
 	accum[len] = cy; len += (cy != 0ull);
-	fprintf(stderr,"Pre-loop accumulator = %llu + 2^64*%llu",accum[0],accum[1]);
+	fprintf(stderr,"Pre-loop accumulator = %" PRIu64 " + 2^64*%" PRIu64,accum[0],accum[1]);
   }
 #endif
-//	fprintf(stderr,"Stage 1 exponent = %llu.",accum[0]);
+//	fprintf(stderr,"Stage 1 exponent = %" PRIu64 ".",accum[0]);
 	while(p < b1) {
 		mult = 1ull;
 		for(i = 0; i < loop; i++) {
@@ -457,8 +457,8 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin
 	uint8 c;
 	uint32 i,j,b1 = 0,nbytes,nlimbs;
 	uint64 itmp64 = 0ull,isum64 = 0ull;
-	ASSERT(HERE, arr != 0x0, "Null arr pointer!");
-	ASSERT(HERE, strlen(fname) != 0, "Empty filename!");
+	ASSERT(arr != 0x0, "Null arr pointer!");
+	ASSERT(strlen(fname) != 0, "Empty filename!");
   #ifdef PM1_STANDALONE
 	FILE*fptr = 0x0;
 	goto PM1_S1P_READ_RETURN;
@@ -507,7 +507,7 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin
 	}
 	for(i = 0; i < nlimbs; i++) { itmp64 += arr[i]; }
 	if(itmp64 != isum64) {
-		sprintf(cbuf, "INFO: %s: Computed checksum[%llX] mismatches one[%llX] appended to savefile data.\n",func,itmp64,isum64);
+		sprintf(cbuf, "INFO: %s: Computed checksum[%" PRIX64 "] mismatches one[%" PRIX64 "] appended to savefile data.\n",func,itmp64,isum64);
 		*sum64 = 0ull;
 		goto PM1_S1P_READ_RETURN;
 	} else {
@@ -529,13 +529,13 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin
 		uint8 c;
 		uint32 i,j,b1 = 0,nbytes,nlimbs;
 		uint64 itmp64 = 0ull;
-		ASSERT(HERE, arr != 0x0, "Null arr pointer!");
-		ASSERT(HERE, strlen(fname) != 0, "Empty filename!");
+		ASSERT(arr != 0x0, "Null arr pointer!");
+		ASSERT(strlen(fname) != 0, "Empty filename!");
 
 		FILE*fptr = mlucas_fopen(fname, "wb");
 		if(!fptr) {
 			sprintf(cbuf,"ERROR: Unable to open precomputed p-1 stage 1 primes-product file %s for writing.\n",fname);
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0, cbuf);
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0, cbuf);
 		}
 		fprintf(stderr,"INFO: Opened precomputed p-1 stage 1 primes-product file %s for writing...\n",fname);
 
@@ -561,7 +561,7 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin
 		// Write 8 bytes of simple (sum of limbs, mod 2^64) checksum, after comparing arglist version to one computed from actual data:
 		for(i = 0; i < nlimbs; i++) { itmp64 += arr[i]; }
 		if(itmp64 != sum64) {
-			sprintf(cbuf, "INFO: %s: Computed checksum[%llX] mismatches one[%llX] in arglist.\n",func,itmp64,sum64);
+			sprintf(cbuf, "INFO: %s: Computed checksum[%" PRIX64 "] mismatches one[%" PRIX64 "] in arglist.\n",func,itmp64,sum64);
 			goto PM1_S1P_WRITE_RETURN;
 		}
 		for(j = 0; j < 64; j += 8) {
@@ -769,7 +769,7 @@ void pm1_bigstep_size(uint32*nbuf, uint32*bigstep, uint32*m, uint32 psmall)
 	else if(psmall == 11)
 		lut = lut_psmall11;
 	else
-		ASSERT(HERE, 0, "pm1_bigstep_size: Bad input value of relocation-prime!");
+		ASSERT(0, "pm1_bigstep_size: Bad input value of relocation-prime!");
 	// High-RAM case - For given D and associated num_b, M = floor(nbuf/num_b), where num_b = 24|40|48|80|96
 	// for D = 210|330|420|660|840. Only need to special-case psmall = 7 here, all others use D = 840:
 	if(*nbuf >= 10000) {
@@ -790,14 +790,14 @@ void pm1_bigstep_size(uint32*nbuf, uint32*bigstep, uint32*m, uint32 psmall)
 		if(lut[i] > *nbuf) break;
 	}
 	if(!i)
-		ASSERT(HERE, 0, "P-1 stage 2 with relocation prime psmall = 7|11 needs at least 40|24 buffers of available RAM, respectively!");
+		ASSERT(0, "P-1 stage 2 with relocation prime psmall = 7|11 needs at least 40|24 buffers of available RAM, respectively!");
 	if(psmall) {
 		sprintf(cbuf,"Previous Stage 2 work used relocation-prime %u ... enforcing compatibility with this: bigstep must be a multiple of %u.\n",psmall,18-psmall);
 		mlucas_fprint(cbuf,pm1_standlone+1);
 		// Here's why we don't declare psmall const in the arglist - it stores the smallest prime which does
 		// not divide the bigstep value, in order to check divisibility replace it by its complement here:
 		psmall = 18-psmall;
-		ASSERT(HERE, (lut[i-1]%psmall == 0), "P-1 stage 2 needs at least 24 buffers of available RAM!");
+		ASSERT((lut[i-1]%psmall == 0), "P-1 stage 2 needs at least 24 buffers of available RAM!");
 		/* First-go-round of this used just a single unified lut[] array and worked backward to the largest nbuf whoe D is compatible:
 		for( ; ; i -= 2) {
 			if(lut[i-1]%psmall == 0) break;
@@ -839,7 +839,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow,
 #ifdef PM1_DEBUG
 	uint32 j; double dsum;
 #endif
-	ASSERT(HERE,a && b && n && func_mod_square,"Null input pointer or vector length in pm1.c::modpow!");
+	ASSERT(a && b && n && func_mod_square,"Null input pointer or vector length in pm1.c::modpow!");
 	// pow = 0: , b[1:n-1] = 0:
 	if(!pow) {
 		b[0] = 1.0;
@@ -853,7 +853,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow,
 	}
 	// Init b = fwdFFT(a); only need this if power != 2^k, in which case we only need autosquarings:
 #ifdef PM1_DEBUG
-	fprintf(stderr,"MODPOW: pow = %llu\n",pow);
+	fprintf(stderr,"MODPOW: pow = %" PRIu64 "\n",pow);
 #endif
 	if(!isPow2_64(pow)) {
 		memcpy(b,a,nbytes);	// b = a           vvvv + 4 to effect "Do in-place forward FFT only; low bit = 0 here implies pure-int input"
@@ -864,7 +864,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow,
 	#ifdef PM1_DEBUG
 		dsum = 0; for(j = 0; j < npad; j++) { dsum += fabs(b[j]); }; fprintf(stderr,"b = fwdFFT(a) gives b[0] = %20.8f, b[1] = %20.8f, L1(b) = %20.8f\n",b[0],b[1],dsum/n); MME = 0;
 	#endif
-	}	ASSERT(HERE, nerr == 0, "func_mod_square returns error!");
+	}	ASSERT(nerr == 0, "func_mod_square returns error!");
 	// Use LR binary modpow algorithm, though it's no faster in this general case than RL:
 	uint32 len = nbits64(pow);
 	pow = reverse64(pow,len)>>1;	// Leftmost bit of input power accounted for by implied (result = a[]) init;
@@ -904,7 +904,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow,
 	  }
 	}
 	// For initial release, no error handling - note we do have ROE handling in the main Stage 2 loop:
-	if(nerr != 0) { sprintf(cbuf,"modpow hit one or more errors! Aborting."); ASSERT(HERE,0,cbuf); }
+	if(nerr != 0) { sprintf(cbuf,"modpow hit one or more errors! Aborting."); ASSERT(0,cbuf); }
 	// Result returned in a[]:
 	return (nerr != 0);
 }
@@ -1038,16 +1038,16 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 			fprintf(stderr, "*** ERROR: Unrecognized flag %s.\n", stFlag);	return 1;
 		}
 	}
-	ASSERT(HERE, bigstep && B1 && B2 && m, "All 4 args bigstep,b1,b2,m must be set > 0!");
+	ASSERT(bigstep && B1 && B2 && m, "All 4 args bigstep,b1,b2,m must be set > 0!");
 	B2_start = (uint64)B1;
   #else
 	// Check function pointer to [mers|fermat]_mod_square based on modulus type:
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
-		ASSERT(HERE, func_mod_square == mers_mod_square  , "Mod-square function pointer incorrectly set in pm1_stage2!");
+		ASSERT(func_mod_square == mers_mod_square  , "Mod-square function pointer incorrectly set in pm1_stage2!");
 	else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
-		ASSERT(HERE, func_mod_square == fermat_mod_square, "Mod-square function pointer incorrectly set in pm1_stage2!");
+		ASSERT(func_mod_square == fermat_mod_square, "Mod-square function pointer incorrectly set in pm1_stage2!");
 	else
-		ASSERT(HERE,0,"Modulus type not set in pm1_stage2!");
+		ASSERT(0,"Modulus type not set in pm1_stage2!");
   #endif
 
   #ifndef PM1_STANDALONE
@@ -1087,7 +1087,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	} else {	// In the case of a standalone S2 interval (B2_small > B1), set psmall = 0 and reloc_start = UINT64_MAX:
 		psmall = 0; reloc_start = -1ull;
 	}
-	sprintf(cbuf,"Using B2_start = %llu, B2 = %llu, Bigstep = %u, M = %u\n",B2_start,B2,bigstep,m);
+	sprintf(cbuf,"Using B2_start = %" PRIu64 ", B2 = %" PRIu64 ", Bigstep = %u, M = %u\n",B2_start,B2,bigstep,m);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 	uint32 reloc_on = FALSE;	// Gets switched to TRUE (= start using semiprimes which are multiples of psmall) when q > reloc_start
 
@@ -1099,8 +1099,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 		tmp = (q0/(uint64)bigstep - 1)<<1;	// tmp holds m_max as a uint64
 		// For this condition to be hit implies q0 quite small, but makes sure resulting m is 32-bit anyway:
 		if((uint64)m < tmp) {
-			sprintf(cbuf, "Nonsensical value of M_max = %llu in qlo-underflow check ... aborting.",tmp);
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+			sprintf(cbuf, "Nonsensical value of M_max = %" PRIu64 " in qlo-underflow check ... aborting.",tmp);
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 		}
 		m = tmp-1;
 		PM1_S2_NBUF = m*num_b;	// Don't use PM1_S2_NBUF per se in code below, but reset for consistency
@@ -1111,7 +1111,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	// May 2021: Added support for M even:
 	m_is_odd = IS_ODD(m);
 	m_is_even = !m_is_odd;
-	ASSERT(HERE, RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n");	// Need BASE_MULTIPLIER_BITS array = 0 for modmuls below!
+	ASSERT(RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n");	// Need BASE_MULTIPLIER_BITS array = 0 for modmuls below!
 	// Alloc the needed memory:
   #ifndef PM1_STANDALONE
 	nlimb = (p+63+(MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6;	// # of 64-bit limbs in p-bit vector, alloc 2 of these for debug:
@@ -1132,20 +1132,20 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	j = 0;
 	if(nalloc & 7)
 		j = 8 - (nalloc & 7);
-	nalloc += j;	ASSERT(HERE, (nalloc & 7) == 0,"nalloc must be a multiple of 8!");	// Ensure 64-byte alignment of a[]
+	nalloc += j;	ASSERT((nalloc & 7) == 0,"nalloc must be a multiple of 8!");	// Ensure 64-byte alignment of a[]
 	// double*a holds ptr to 1 scratch vector, double**buf holds ptrs to num_b*m double-vecs of same length npad:
 	a_ptmp = ALLOC_DOUBLE(a_ptmp, nalloc);
 	if(!a_ptmp){
 		sprintf(cbuf, "ERROR: unable to allocate the needed %u buffers of p-1 Stage 2 storage.\n",num_b*m + use_pp1);
-		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 	}
-	a      = ALIGN_DOUBLE(a_ptmp);	ASSERT(HERE, ((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
+	a      = ALIGN_DOUBLE(a_ptmp);	ASSERT(((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
 	buf = (double **)calloc(num_b*m,sizeof(double *));
 	// ...and num_b*m "buffers" for precomputed bigstep-coprime odd-square powers of the stage 1 residue:
 	for(i = 0; i < num_b*m; i++) {
 		buf[i] = a + i*npad;
-//		fprintf(stderr,"buf[%3d] = 0x%llX\n",i,(uint64)buf[i]);
-		ASSERT(HERE, ((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
+//		fprintf(stderr,"buf[%3d] = %#" PRIX64 "\n",i,(uint64)buf[i]);
+		ASSERT(((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
 	}
 	// Still do fwdFFT(1) as init-FFT step in non-(p+1) build, but use uppermost buf[] entry to hold as throwaway result:
 	vone = a + (i - 1 + use_pp1)*npad;
@@ -1168,8 +1168,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	pthread_attr_init(&attr);
 	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
 	const int nbytes_simd_align = (RE_IM_STRIDE*8) - 1;	// And per-thread data chunk addresses with this to check SIMD alignment
-	ASSERT(HERE, ((intptr_t)mult[0] & nbytes_simd_align) == 0x0,"mult[0] not aligned on 64-byte boundary!");
-	ASSERT(HERE, ((intptr_t)buf [0] & nbytes_simd_align) == 0x0,"buf [0] not aligned on 64-byte boundary!");	// Since npad a multiple of RE_IM_STRIDE, only need to check buf[0] alignment
+	ASSERT(((intptr_t)mult[0] & nbytes_simd_align) == 0x0,"mult[0] not aligned on 64-byte boundary!");
+	ASSERT(((intptr_t)buf [0] & nbytes_simd_align) == 0x0,"buf [0] not aligned on 64-byte boundary!");	// Since npad a multiple of RE_IM_STRIDE, only need to check buf[0] alignment
 	j = npad / NTHREADS;	// j = #doubles in each thread-processed chunk
 	/* Fiddle up-or-downward to make it a multiple of RE_IM_STRIDE; say this == 8. Since j == (npad/NTHREADS) - [0 or 1]
 	due to truncation-on-integer-div, if jmod := (j % RE_IM_STRIDE) < RE_IM_STRIDE/2, subtract jmod from j, otherwise
@@ -1190,15 +1190,15 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 		tdat[i].n = j;	// Chunksize
 	}
 	tdat[NTHREADS-1].n = npad - (NTHREADS-1)*j;	// Fiddle the last thread's chunksize so the sum == npad
-	ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, NTHREADS, &thread_control)), "threadpool_init failed!");
+	ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, NTHREADS, &thread_control)), "threadpool_init failed!");
 	printf("%s: Init threadpool of %d threads\n",func,NTHREADS);
    #endif	// PM1_STANDALONE?
   #endif	// MULTITHREAD?
 	// Integer arrays:
-	b = malloc(m*(bigstep>>1)*sizeof(uint32));	ASSERT(HERE, b != NULL, "B[]-array alloc failed!");
+	b = malloc(m*(bigstep>>1)*sizeof(uint32));	ASSERT(b != NULL, "B[]-array alloc failed!");
 	/* Jun 2021: added (psmall) map words for psmall = (mod 7|11) bitmap needed to support small-prime relocation -
 	optimization - This needs wsize bytes, hence the (...+1)*wsize: */
-	map = calloc((m+2+1)*wsize,sizeof(uint8));	ASSERT(HERE, map != NULL, "map[]-array alloc failed!");
+	map = calloc((m+2+1)*wsize,sizeof(uint8));	ASSERT(map != NULL, "map[]-array alloc failed!");
 	// 2 extra word-slots at high end of map used for these temps - can't declare as const pointers,0x but treat as such below:
 	lo = map + m*wsize; hi = lo + wsize;
 	rmap = hi + wsize;
@@ -1266,7 +1266,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 		retval = 1; goto ERR_RETURN;
 	}
 	for(j = 2*num_b; j < m*num_b; j++) {
-		ASSERT(HERE,b[j] == bigstep + b[j-2*num_b],"Bigstep-power-offset check fails!");
+		ASSERT(b[j] == bigstep + b[j-2*num_b],"Bigstep-power-offset check fails!");
 	}
 
 #if !defined(PM1_STANDALONE) && defined(PM1_DEBUG)
@@ -1282,17 +1282,17 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	ierr += func_mod_square(mult[1], 0x0, n, 0,1,(uint64)a + mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);/* and done. */
 	if(ierr != 0) {
 		sprintf(cbuf,"Modmul test hit an error of type = %u! Aborting.",ierr);
-		ASSERT(HERE,0,cbuf);
+		ASSERT(0,cbuf);
 	}
 	convert_res_FP_bytewise(mult[0],(uint8*)vec1, n, p, 0x0,0x0,0x0);
 	convert_res_FP_bytewise(mult[1],(uint8*)vec2, n, p, 0x0,0x0,0x0);
-	ASSERT(HERE, mi64_cmp_eq(vec1,vec2,nlimb), "Modmul-test results mismatch!");
+	ASSERT(mi64_cmp_eq(vec1,vec2,nlimb), "Modmul-test results mismatch!");
   /********************************************************************************/
   /********* Known-stage-2-factor tests, starting with a stage 1 residue: *********/
   /********************************************************************************/
   // F31: Do a single stage-1-result-powering (pow^140091319777 - 1) and make sure the known factor divides the result:
   if(p == 2147483648) {
-	ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!");
+	ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!");
 	input_is_int = TRUE;
 	memcpy(a,pow,nbytes);
 	modpow(a, mult[0], input_is_int, 140091319777ull, func_mod_square, p, n, scrnFlag,&tdif2);
@@ -1304,14 +1304,14 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	// In fact, F31 has nlimb+1 words, but the only way a p-1 residue R has the same high bit
 	// set as F31 iif R == F31 (uninteresting) or R == 2^2^31, which implies GCD == 1:
 	int isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem);
-	ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!");
+	ASSERT(isfact != 0, "Failed to find known stage 2 factor!");
 	fprintf(stderr,"%s p-1 known-stage-2 prime stage 1 powering success!\n",PSTRING);
   }
   // M(139788679): Do a stage-1-result-powering (pow^a - 1) with a = 9952471 and make sure the corresponding
   // known factor, q = 842944537391616 = 2.k.p+1 with k = 2^9.3^2.11^2.29.37.1187^2, divides the result.
   // With B1 < 1187^2 = 1408969 this factor is not found after stage 1 since this prime appears only as a single-power:
   if(p == 139788679) {
-	ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!");
+	ASSERT(MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!");
 	// A^4002923: Use mult[0] as scratch array for modpow():
 	input_is_int = TRUE;
 	memcpy(a,pow,nbytes);
@@ -1322,7 +1322,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	convert_res_FP_bytewise(a,(uint8*)vec1, n, p, 0x0,0x0,0x0);
 	uint64 rem[2] = {0ull,0ull}, q[2] = {11051162840690736129ull,12775ull};	// q = 1314651028704963254300497
 	int isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem);
-	ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!");
+	ASSERT(isfact != 0, "Failed to find known stage 2 factor!");
 	fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING);
 	exit(0);
   }
@@ -1372,7 +1372,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	*/
 	vone[0] = 1.0; for(j = 1; j < n; j++) { vone[j] = 0.0; }
 													//vvv-- Pure-int inputs, so mode_flag = 0
-	ierr = func_mod_square(vone, (void *)a, n, 0,1, 4ull, p, scrnFlag,&tdif2, FALSE, 0x0);	ASSERT(HERE, ierr == 0,"fwdFFT(1) hit error!");
+	ierr = func_mod_square(vone, (void *)a, n, 0,1, 4ull, p, scrnFlag,&tdif2, FALSE, 0x0);	ASSERT(ierr == 0,"fwdFFT(1) hit error!");
 
   #if !USE_PP1_MULTS		// Basic version:
 
@@ -1413,8 +1413,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 		#ifdef PM1_DEBUG
 			fprintf(stderr,"%u^2.",j);
 		#endif
-//			fprintf(stderr,"buf[%3d] = 0x%llX\n",i,(uint64)buf[i]);
-			ASSERT(HERE, ((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
+//			fprintf(stderr,"buf[%3d] = %#" PRIX64 "\n",i,(uint64)buf[i]);
+			ASSERT(((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
 			memcpy(buf[i++],mult[0],nbytes);	// buf[i++] = mult[0] = fwd-FFT-pass-1-done(A^1,9,25,...)
 		}
 		// Up-multiply the fwd-FFT-pass-1-done(A^8,16,24,...) by fixed multiplier fwd-FFT(A^8):
@@ -1430,7 +1430,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
 	// vec1,vec2 will hold stage 1 residue A (stored in FP form in pow[]) and its mod-inverse in packed-bit form
 	vec1[nlimb-1] = 0ull;
 	convert_res_FP_bytewise(pow,(uint8*)vec1, n, p, &Res64,&Res35m1,&Res36m1);
-fprintf(stderr,"#1: vec1 = A^+1 checksums = %llu,%llu,%llu; FP(A)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, pow[0],pow[1]);
+fprintf(stderr,"#1: vec1 = A^+1 checksums = %" PRIu64 ",%" PRIu64 ",%" PRIu64 "; FP(A)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, pow[0],pow[1]);
 	// First see if there's a savefile-copy of the s1 residue-inverse:
 	strcpy(inv_file, RESTARTFILE);
 	inv_file[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f');
@@ -1440,20 +1440,20 @@ fprintf(stderr,"#1: vec1 = A^+1 checksums = %llu,%llu,%llu; FP(A)[0:1] = %10.2f,
 	if(fp) {												// G-check residue fields all set NULL in this call:
 		i = read_ppm1_savefiles(inv_file, p, &k, fp, &tmp, (uint8*)vec2, &Res64,&Res35m1,&Res36m1, 0x0,0x0,0x0,0x0);
 		fclose(fp); fp = 0x0;
-		ASSERT(HERE, tmp == 0ull, "Stage 1 residue-inverse savefile should have nsquares == 0!");
+		ASSERT(tmp == 0ull, "Stage 1 residue-inverse savefile should have nsquares == 0!");
 		if(!i) {
 			/* First print any error message that may have been issued during the above function call: */
 			if(strstr(cbuf, "read_ppm1_savefiles"))
 				mlucas_fprint(cbuf,pm1_standlone+1);
 			// And now for the official spokesmessage:
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "Read of stage 1 residue-inverse savefile %s failed for reasons unknown. Computing inverse...\n",inv_file);
+			snprintf(cbuf,STR_MAX_LEN*2, "Read of stage 1 residue-inverse savefile %s failed for reasons unknown. Computing inverse...\n",inv_file);
 			mlucas_fprint(cbuf,pm1_standlone+1);
 		} else {
 			s1_inverse = TRUE;
 		}
 	}
 	if(!s1_inverse) {
-		snprintf(cbuf,STR_MAX_LEN, "Stage 2: Computing mod-inverse of Stage 1 residue...\n");	mlucas_fprint(cbuf,pm1_standlone+1);
+		snprintf(cbuf,STR_MAX_LEN*2, "Stage 2: Computing mod-inverse of Stage 1 residue...\n");	mlucas_fprint(cbuf,pm1_standlone+1);
 		modinv(p,vec1,vec2,nlimb);	// Result in vec2
 		Res64 = vec2[0];
 		Res35m1 = mi64_div_by_scalar64(vec2,two35m1,nlimb,0x0);
@@ -1464,12 +1464,12 @@ fprintf(stderr,"#1: vec1 = A^+1 checksums = %llu,%llu,%llu; FP(A)[0:1] = %10.2f,
 			write_ppm1_savefiles(inv_file,p,n,fp, 0ull, (uint8*)vec2,Res64,Res35m1,Res36m1, 0x0,0x0,0x0,0x0);
 			fclose(fp);	fp = 0x0;
 		} else {
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open restart file %s for write of checkpoint data.\n",inv_file);
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+			snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open restart file %s for write of checkpoint data.\n",inv_file);
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 		}
 	}
 	convert_res_bytewise_FP((uint8*)vec2, a, n, p);	// Use a[] to hold inverse A^-1 until done with it
-fprintf(stderr,"#1: vec2 = A^-1 checksums = %llu,%llu,%llu; FP(A^-1)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, a[0],a[1]);
+fprintf(stderr,"#1: vec2 = A^-1 checksums = %" PRIu64 ",%" PRIu64 ",%" PRIu64 "; FP(A^-1)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, a[0],a[1]);
    #ifdef PM1_DEBUG
 	fprintf(stderr,"Checking mod-inverse...\n");
 	// Debug: check inverse ... start with copies of A (pow[]) and A^-1 (a[]) into mult0-1:
@@ -1479,8 +1479,8 @@ fprintf(stderr,"#1: vec2 = A^-1 checksums = %llu,%llu,%llu; FP(A^-1)[0:1] = %10.
 	// mult[0] = A * A^-1, check that result = 1 as expected:
 	mode_flag = 0;	// bits 0:1 of mode_flag = 0, since mult[0] enters in pure-int form and want output the same way
 	ierr += func_mod_square(mult[0], 0x0, n, 0,1, (uint64)mult[1] + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);
-	ASSERT(HERE, mult[0][0] == 1.0, "inverse-check fails!");
-	for(i = 1; i < npad; i++) { ASSERT(HERE, mult[0][i] == 0.0, "inverse-check fails!"); }
+	ASSERT(mult[0][0] == 1.0, "inverse-check fails!");
+	for(i = 1; i < npad; i++) { ASSERT(mult[0][i] == 0.0, "inverse-check fails!"); }
    #endif
 	/*** NOTE: vec1,vec2 hold stage 1 residue A and its mod-inverse in packed-bit form, can use any of
 	pow[], mult[0-2][], a[] in our ensuing stage 2 inits and still re-obtain b and b^-1 from vec1,2 at any time. ***/
@@ -1502,19 +1502,19 @@ fprintf(stderr,"#1: vec2 = A^-1 checksums = %llu,%llu,%llu; FP(A^-1)[0:1] = %10.
 			// limb than p>>6, so shorten sub-vec-length by 1 to make sure any carry ends up in tmp, not vec2[nlimb-1]
 	// Sign of wraparound carry depends on modulus type:
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
-		ASSERT(HERE,tmp == 0ull,"Mersenne-mod vec1+vec2 should never spill over into next word!");
+		ASSERT(tmp == 0ull,"Mersenne-mod vec1+vec2 should never spill over into next word!");
 		// Now get the really carry bit, bit p. Normalized residue only uses bits <0:p-1>:
 		bit = p&63; word = p>>6; tmp = vec2[word]>>bit;
 		q = mi64_add_scalar(vec2, tmp, vec2, nlimb);
-		ASSERT(HERE,q == 0ull,"Mersenne-mod vec1+vec2 wraparound carry should never have carry-out!");
+		ASSERT(q == 0ull,"Mersenne-mod vec1+vec2 wraparound carry should never have carry-out!");
 	} else {
 		q = mi64_sub_scalar(vec2, tmp, vec2, nlimb-1);	// Again shorten vec-sub-length by 1 to properly check for borrow-out
-		ASSERT(HERE,q == 0ull,"Fermat-mod vec1+vec2 wraparound carry should never have borrow-out!");
+		ASSERT(q == 0ull,"Fermat-mod vec1+vec2 wraparound carry should never have borrow-out!");
 	}
 	convert_res_bytewise_FP((uint8*)vec2,buf[0], n, p);	// buf[0] = V[1]
 	// Now recover original vec2 = A^-1 from the FP version in a[]:
 	convert_res_FP_bytewise(a,(uint8*)vec2, n, p, &Res64, &Res35m1, &Res36m1);
-fprintf(stderr,"#2: vec2 = A^-1 checksums = %llu,%llu,%llu\n",Res64,Res35m1,Res36m1);
+fprintf(stderr,"#2: vec2 = A^-1 checksums = %" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",Res64,Res35m1,Res36m1);
    #else
 	// Original FP code for V[1] lacks post-add normalization:
 	for(i = 0; i < npad; i++) { buf[0][i] = pow[i] + a[i]; }	// V[1] = A^1 + A^-1
@@ -1677,7 +1677,7 @@ MME = 0;
 	} else {
 		tmp = vec1[nlimb-1];
 	}
-	ASSERT(HERE,tmp == 0ull,"Properly normalized residue should never spill over into next word!");
+	ASSERT(tmp == 0ull,"Properly normalized residue should never spill over into next word!");
    #endif	// PM1_DEBUG?
 
   #endif	// USE_PP1_MULTS?
@@ -1687,18 +1687,18 @@ MME = 0;
   #endif
 	if(nerr != 0) {
 		sprintf(cbuf,"Stage 2 buffer-init hit 1 or more fatal errors! Aborting.");
-		mlucas_fprint(cbuf,pm1_standlone+0);	ASSERT(HERE, 0,cbuf);
+		mlucas_fprint(cbuf,pm1_standlone+0);	ASSERT(0,cbuf);
 	}
 	if(i != m*num_b) {
 		sprintf(cbuf,"Stage 2: Incorrect loop-exit value of buffer-index!");
-		mlucas_fprint(cbuf,pm1_standlone+0);	ASSERT(HERE, 0,cbuf);
+		mlucas_fprint(cbuf,pm1_standlone+0);	ASSERT(0,cbuf);
 	}
 	// buf[] entries all need to be rest-of-fwd-FFTed;
 	for(i = 0; i < m*num_b; i++) {
 		// Since buf[0] holds pure-int copy of stage 1 residue A on loop entry, bit 0 of mode_flag = 0 for just it:
 		//                                                                    vvvvvvvv
 		ierr = func_mod_square(buf[i], 0x0, n, 0,1, 4ull + (uint64)(mode_flag - (i==0)), p, scrnFlag,&tdif2, FALSE, 0x0); nerr += ierr;
-	}	ASSERT(HERE, nerr == 0, "fwdFFT of buf[] entries returns error!");
+	}	ASSERT(nerr == 0, "fwdFFT of buf[] entries returns error!");
 
 	// Accumulate the cycle count in a floating double on each pass to avoid problems
 	// with integer overflow of the clock() result, if clock_t happens to be 32-bit int on the host platform:
@@ -1708,7 +1708,7 @@ MME = 0;
 	clock2 = getRealTime();
   #endif
 	*tdiff = clock2 - clock1; clock1 = clock2;
-	snprintf_nowarn(cbuf,STR_MAX_LEN, "Buffer-init done; clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME);
+	snprintf(cbuf,STR_MAX_LEN*2, "Buffer-init done; clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 
 	/********************* RESTART FILE STUFF: **********************/
@@ -1726,7 +1726,7 @@ MME = 0;
 		if(i && psmall)	{
 			// We expect the main-program S2-invocation code to have resolved this kind of mismatch via bigstep selection:
 			if(qlo >> 56)
-				ASSERT(HERE, (uint32)(qlo >> 56) == psmall, "Mismatch between relocation-prime set for stage 2 restart and one read from S2 savefile!");
+				ASSERT((uint32)(qlo >> 56) == psmall, "Mismatch between relocation-prime set for stage 2 restart and one read from S2 savefile!");
 		}
 		qlo &= 0x00ffffffffffffffull;	// Mask off high byte storing psmall
 		// If savefile-read fails, start stage 2 from B2_start:
@@ -1739,9 +1739,9 @@ MME = 0;
 			}
 			// If nsquares > B2_start, arrtmp holds the S2 interim residue for q = nsquares; set up to restart S2 at that point.
 			if(qlo >= B2_start) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "Read stage 2 savefile %s ... restarting stage 2 from q = %llu.\n",savefile,qlo);
+				snprintf(cbuf,STR_MAX_LEN*2, "Read stage 2 savefile %s ... restarting stage 2 from q = %" PRIu64 ".\n",savefile,qlo);
 			} else {	// If user running a new partial S2 interval with bounds larger than a previous S2 run, allow but info-print to that effect:
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "INFO: %s savefile has qlo[%llu] <= B2_start[%llu] ... Stage 2 interval will skip intervening primes.\n",func,qlo,B2_start);
+				snprintf(cbuf,STR_MAX_LEN*2, "INFO: %s savefile has qlo[%" PRIu64 "] <= B2_start[%" PRIu64 "] ... Stage 2 interval will skip intervening primes.\n",func,qlo,B2_start);
 			}
 			mlucas_fprint(cbuf,pm1_standlone+1);
 			restart = TRUE;
@@ -1758,7 +1758,7 @@ MME = 0;
 	if(!qlo) {			// If qlo unset, set = default stage 2 starting point ... if qlo already set via restart-file
 		qlo = B2_start;	// read, it will automatically be > our small-prime-relocation-reflecting value of B2_start.
 		if(psmall && B2_start > B1) {	// If psmall = 0, it's an S2 continuation run, no relocation done
-			sprintf(cbuf,"Small-prime[%u] relocation: will start Stage 2 at bound %llu\n",psmall,qlo);
+			sprintf(cbuf,"Small-prime[%u] relocation: will start Stage 2 at bound %" PRIu64 "\n",psmall,qlo);
 			mlucas_fprint(cbuf,pm1_standlone+1);
 		}
 	}
@@ -1774,9 +1774,9 @@ MME = 0;
 	k = k0 = q0/bigstep;	// Now set k to its 'real' value
 	if((uint64)k*bigstep != q0) {
 		sprintf(cbuf,"k must be 32-bit!");
-		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 	}
-	sprintf(cbuf,"Stage 2 q0 = %llu, k0 = %u\n",q0,k0);
+	sprintf(cbuf,"Stage 2 q0 = %" PRIu64 ", k0 = %u\n",q0,k0);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 	/*
 	Expanded-match-window scheme needs us to precompute singleton-prime-q's bitmap corresponding to M intervals
@@ -1804,7 +1804,7 @@ MME = 0;
 	We only start actual 0-interval and extended-window pairing when said interval has shifted to the middle
 	of the extended pairing window, i.e. is the 0-interval (M odd), or shifted just left of the map midpoint (M even):
 	*/
-	ASSERT(HERE, q0 > (m2+1)*(uint64)bigstep, "ERROR: qlo underflows in p-1 stage 2.");
+	ASSERT(q0 > (m2+1)*(uint64)bigstep, "ERROR: qlo underflows in p-1 stage 2.");
 	qlo = q0 - (m2+1)*(uint64)bigstep;
 	/*
 	[c] Our A^(a^2) values = A^((k*D)^2) and we'll be incrementing k between sweeps over the set of b's.
@@ -1813,7 +1813,7 @@ MME = 0;
 	*/
 	// At this point pow = A[stage 1 residue]; need either A^(D^2) or (A^D + A^-D), where D = bigstep:
 #ifndef PM1_STANDALONE
-	snprintf_nowarn(cbuf,STR_MAX_LEN, "Computing Stage 2 loop-multipliers...\n");	mlucas_fprint(cbuf,pm1_standlone+1);
+	snprintf(cbuf,STR_MAX_LEN*2, "Computing Stage 2 loop-multipliers...\n");	mlucas_fprint(cbuf,pm1_standlone+1);
 	MME = 0.0;	// Reset maxROE
 	// Raise A to power D^2, using mult[0] as a scratch array; again crap-API forces us to specify an "input is pure-int?" flag:
 	input_is_int = TRUE;
@@ -1894,9 +1894,9 @@ MME = 0;
 	// mult[0] = A^+D * A^-D, check that result = 1 as expected:
 	mode_flag = 1;	// bits 0:1 of mode_flag = 1,0, since mult[2] enters in fwd-FFT-pass-1-done form and want output in pure-int form
 	ierr += func_mod_square(mult[2], 0x0, n, 0,1, (uint64)a + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);
-	ASSERT(HERE, mult[2][0] == 1.0, "inverse-check fails!");
+	ASSERT(mult[2][0] == 1.0, "inverse-check fails!");
 	for(i = 1; i < npad; i++) {
-		ASSERT(HERE, mult[2][i] == 0.0, "inverse-check fails!");
+		ASSERT(mult[2][i] == 0.0, "inverse-check fails!");
 	}
 	fprintf(stderr,"A^-D inverse check passed ... exiting.\n");
 	exit(0);	// Since above debug overwrites mult[2], must quit.
@@ -1924,9 +1924,9 @@ MME = 0;
 		tmp = vec2[nlimb-1];
 		q = mi64_sub_scalar(vec2, tmp, vec2, nlimb-1);	// Again shorten vec-sub-length by 1 to properly check for borrow-out
 	}
-	ASSERT(HERE,q == 0ull,"Properly normalized vec1+vec2 wraparound carry should never have borrow-out!");
+	ASSERT(q == 0ull,"Properly normalized vec1+vec2 wraparound carry should never have borrow-out!");
 	// Now compare to alternate-path V[D] computed in above buffer-init code:
-	ASSERT(HERE, mi64_cmp_eq(vec1,vec2,nlimb), "V[D] results mismatch!");
+	ASSERT(mi64_cmp_eq(vec1,vec2,nlimb), "V[D] results mismatch!");
 	fprintf(stderr,"V[D] cross-check passed ... exiting.\n");
 	exit(0);	// Since above debug overwrites a[], must quit.
    #endif	// PM1_DEBUG?
@@ -1942,9 +1942,9 @@ MME = 0;
 	// pow[] = A^+((k0-1)*D) * A^-((k0-1)*D), check that result = 1 as expected:
 	mode_flag = 1;	// bits 0:1 of mode_flag = 1,0, since pow[] enters in fwd-FFT-pass-1-done form and want output in pure-int form
 	ierr += func_mod_square(pow, 0x0, n, 0,1, (uint64)mult[1] + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);
-	ASSERT(HERE, pow[0] == 1.0, "A^-(k0-1)*D inverse-check fails!");
+	ASSERT(pow[0] == 1.0, "A^-(k0-1)*D inverse-check fails!");
 	for(i = 1; i < npad; i++) {
-		ASSERT(HERE, pow[i] == 0.0, "A^-(k0-1)*D inverse-check fails!");
+		ASSERT(pow[i] == 0.0, "A^-(k0-1)*D inverse-check fails!");
 	}
 	fprintf(stderr,"A^-(k0-1)*D inverse check passed ... exiting.\n");
 	exit(0);	// Since above debug overwrites pow[], must quit.
@@ -1973,9 +1973,9 @@ MME = 0;
 	ierr += func_mod_square(mult[0], 0x0, n, 0,1, 4ull + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);
 	mode_flag = 1;	// bits 0:1 of mode_flag = 1,0, since pow[] enters in fwd-FFT-pass-1-done form and want output in pure-int form
 	ierr += func_mod_square(pow, 0x0, n, 0,1, (uint64)mult[0] + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);
-	ASSERT(HERE, pow[0] == 1.0, "A^-k0*D inverse-check fails!");
+	ASSERT(pow[0] == 1.0, "A^-k0*D inverse-check fails!");
 	for(i = 1; i < npad; i++) {
-		ASSERT(HERE, pow[i] == 0.0, "A^-k0*D inverse-check fails!");
+		ASSERT(pow[i] == 0.0, "A^-k0*D inverse-check fails!");
 	}
 	fprintf(stderr,"A^-k0*D inverse check passed ... exiting.\n");
 	exit(0);	// Since above debug overwrites pow[], must quit.
@@ -1990,8 +1990,8 @@ MME = 0;
 
 	if(restart) {	// If restart, convert bytewise-residue S2 accumulator read from file to floating-point form:
 		if(!convert_res_bytewise_FP((uint8*)arrtmp, pow, n, p)) {
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",savefile);
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+			snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",savefile);
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 		}
 		// Restart-file-read S2 interim residue in pow[] needs fwd-weight and FFT-pass1-done:
 		ierr = func_mod_square(pow, 0x0, n, 0,1, -4ull, p, scrnFlag,&tdif2, FALSE, 0x0);
@@ -2003,8 +2003,8 @@ MME = 0;
 	   #if 0	// A: No, because pow = A^(k0*D) + A^-(k0*D) is perfectly fine as S2 init-accumulator
 		vec1[nlimb-1] = 0ull;
 		if(!convert_res_bytewise_FP((uint8*)vec1, pow, n, p)) {
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on S1 residue in vec1!\n");
-			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+			snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on S1 residue in vec1!\n");
+			mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 		}
 		// Pure-int S1 residue in pow[] needs fwd-weight and FFT-pass1-done:
 		ierr = func_mod_square(pow, 0x0, n, 0,1, -4ull, p, scrnFlag,&tdif2, FALSE, 0x0);
@@ -2021,7 +2021,7 @@ MME = 0;
 	ierr = func_mod_square(mult[2], 0x0, n, 0,1, 4ull + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); nerr += ierr;
 	if(nerr != 0) {
 		sprintf(cbuf,"Stage 2 loop-multipliers computation hit one or more fatal errors! Aborting.");
-		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+		mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 	}
   #ifdef CTIME
 	clock2 = clock();
@@ -2029,7 +2029,7 @@ MME = 0;
 	clock2 = getRealTime();
   #endif
 	*tdiff = clock2 - clock1; clock1 = clock2;
-	snprintf_nowarn(cbuf,STR_MAX_LEN, "Stage 2 loop-multipliers: clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME);
+	snprintf(cbuf,STR_MAX_LEN*2, "Stage 2 loop-multipliers: clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 	*tdiff = AME = MME = 0.0;	// Reset timer and maxROE, now also init AvgROE
 	AME_ITER_START = 0;	// For p-1 stage 2, start collecting AvgROE data immediately, no need t wait for residue to "fill in"
@@ -2050,7 +2050,7 @@ MME = 0;
 	{
 		if(!reloc_on && q >= reloc_start) {	// Start including relocation-semiprimes once S@ passes this point
 			reloc_on = TRUE;
-			sprintf(cbuf,"Hit q = %llu >= reloc_start[%llu] ... enabling small-prime relocation.\n",q,reloc_start);
+			sprintf(cbuf,"Hit q = %" PRIu64 " >= reloc_start[%" PRIu64 "] ... enabling small-prime relocation.\n",q,reloc_start);
 			mlucas_fprint(cbuf,pm1_standlone+1);
 		}
 		// Only start actual 0-interval and extended-window pairing when q hits q0:
@@ -2061,7 +2061,7 @@ MME = 0;
 			center of the expanded-match window, recompute same batch of stage 2 powering pairs q1,q2[i] = k*D +- b[i]
 			and process the both-q1-and-q2-prime pairs: */
 		#ifdef PM1_DEBUG
-			fprintf(stderr,"k = %u: q = %llu\n",k,q);
+			fprintf(stderr,"k = %u: q = %" PRIu64 "\n",k,q);
 			fprintf(stderr,"Processing 0-interval prime pairs:\n");
 		#endif
 			map_lo = map + m2*wsize + (wsize>>1);	// ptr to midpoint of 0-interval map word
@@ -2071,9 +2071,9 @@ MME = 0;
 			#ifdef PM1_DEBUG
 				q1 = q - b[i]; q2 = q + b[i];
 				bit = pprimeF64(q1,2ull); if(p1 != bit)
-					fprintf(stderr,"Mismatch: q1 = %llu[%u], bytevec_test_bit returns %u\n",q1,bit,p1);
+					fprintf(stderr,"Mismatch: q1 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q1,bit,p1);
 				bit = pprimeF64(q2,2ull); if(p2 != bit)
-					fprintf(stderr,"Mismatch: q2 = %llu[%u], bytevec_test_bit returns %u\n",q2,bit,p2);
+					fprintf(stderr,"Mismatch: q2 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q2,bit,p2);
 			#endif
 				// Skip a given value of i if one or both of q1,q2[i] are composite according to a 2-prp test:
 				j = p1+p2;
@@ -2083,12 +2083,12 @@ MME = 0;
 					if(j < m)
 						continue;
 				#ifdef PM1_DEBUG
-					fprintf(stderr,"\tq1 = %llu[%u], q2 = %llu[%u], 1-prime\n",q1,p1,q2,p2);
+					fprintf(stderr,"\tq1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], 1-prime\n",q1,p1,q2,p2);
 				#endif
 					ns++;
 				} else {
 				#ifdef PM1_DEBUG
-					fprintf(stderr,"\tq1 = %llu[%u], q2 = %llu[%u], both prime\n",q1,p1,q2,p2);
+					fprintf(stderr,"\tq1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], both prime\n",q1,p1,q2,p2);
 				#endif
 					np++;
 				}
@@ -2143,16 +2143,16 @@ MME = 0;
 					p1 = bytevec_test_bit(map_lo,j); p2 = bytevec_test_bit(map_hi,j);
 					q1 = q - b[tmp+j]; q2 = q + b[tmp+j];
 					bit = pprimeF64(q1,2ull); if(p1 != bit)
-						fprintf(stderr,"Mismatch: q1 = %llu[%u], bytevec_test_bit returns %u\n",q1,bit,p1);
+						fprintf(stderr,"Mismatch: q1 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q1,bit,p1);
 					bit = pprimeF64(q2,2ull); if(p2 != bit)
-						fprintf(stderr,"Mismatch: q2 = %llu[%u], bytevec_test_bit returns %u\n",q2,p2,bit);
+						fprintf(stderr,"Mismatch: q2 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q2,bit,p2);	// Arg order as in the q1 case: (q, pprimeF64 result, map bit)
 				#endif
 					// For thus-paired prime-q's, update stage 2 accumulator:
 					p1 = bytevec_test_bit(lo,j);
 					if(p1) {
 						np++;
 					#ifdef PM1_DEBUG
-						fprintf(stderr,"\tq = %llu -+ %u: q1 = %llu[%u], q2 = %llu[%u], paired singles\n",q,b[tmp+j],q-b[tmp+j],p1,q+b[tmp+j],p2);
+						fprintf(stderr,"\tq = %" PRIu64 " -+ %u: q1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], paired singles\n",q,b[tmp+j],q-b[tmp+j],p1,q+b[tmp+j],p2);
 					#endif
 					#ifndef PM1_STANDALONE
 					 #ifdef USE_VEC_DBL_SUB
@@ -2194,7 +2194,7 @@ MME = 0;
 					#ifdef PM1_DEBUG
 						q1 = q-b[tmp+i]; q2 = q+b[tmp+i];
 						p1 = pprimeF64(q1,2ull); p2 = pprimeF64(q2,2ull);	// Run q1,q2 through a base-2 Fermat-composite test
-						fprintf(stderr,"\tq = %llu -+ %u: q1 = %llu[%u], q2 = %llu[%u], 1-prime\n",q,b[tmp+i],q1,p1,q2,p2);
+						fprintf(stderr,"\tq = %" PRIu64 " -+ %u: q1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], 1-prime\n",q,b[tmp+i],q1,p1,q2,p2);
 					#endif
 					#ifndef PM1_STANDALONE
 					 #ifdef USE_VEC_DBL_SUB
@@ -2247,7 +2247,7 @@ MME = 0;
 			Working leftward from right endpoint of interval, init single 2*num_b-bit accumulator.
 		*/
 	#ifdef PM1_DEBUG
-		fprintf(stderr,"New upper-interval with q0 = %llu, tagging its primes:\n",tmp);
+		fprintf(stderr,"New upper-interval with q0 = %" PRIu64 ", tagging its primes:\n",tmp);
 	#endif
 	/*
 		Prime relocation: Illustrate using psmall = 11, but analogous pattering holds for psmall = 7 (D = 330|660):
@@ -2309,13 +2309,13 @@ MME = 0;
 					if(bytevec_test_bit(rmap,j      )) {
 						q1 *= pinv64;
 					#ifdef PM1_DEBUG
-						fprintf(stderr,"reloc q1: %llu => %llu\n",q1,q1*psmall);
+						fprintf(stderr,"reloc q1: %" PRIu64 " => %" PRIu64 "\n",q1,q1*psmall);
 					#endif
 					}
 					if(bytevec_test_bit(rmap,i+num_b)) {
 						q2 *= pinv64;
 					#ifdef PM1_DEBUG
-						fprintf(stderr,"reloc q2: %llu => %llu\n",q2,q2*psmall);
+						fprintf(stderr,"reloc q2: %" PRIu64 " => %" PRIu64 "\n",q2,q2*psmall);
 					#endif
 					}
 				}
@@ -2336,7 +2336,7 @@ MME = 0;
 				if(psmall && bytevec_test_bit(rmap,j)) {
 					q1 *= pinv64;
 				#ifdef PM1_DEBUG
-					fprintf(stderr,"reloc q: %llu => %llu\n",q1,q1*psmall);
+					fprintf(stderr,"reloc q: %" PRIu64 " => %" PRIu64 "\n",q1,q1*psmall);
 				#endif
 				}
 				p1 = pprimeF64(q1,2ull);
@@ -2446,7 +2446,7 @@ MME = 0;
 			strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S",local_time);
 			AME /= (nmodmul - nmodmul_save);
 			// Print [date in hh:mm:ss | p | stage progress | %-complete | time | per-iter time | Res64 | max ROE:
-			snprintf_nowarn(cbuf,STR_MAX_LEN, "[%s] %s %s = %llu [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016llX. AvgMaxErr = %10.9f. MaxErr = %10.9f.\n"
+			snprintf(cbuf,STR_MAX_LEN*2, "[%s] %s %s = %" PRIu64 " [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016" PRIX64 ". AvgMaxErr = %10.9f. MaxErr = %10.9f.\n"
 				, timebuffer, PSTRING, "S2 at q", q+bigstep, (float)(q-B2_start)/(float)(B2-B2_start) * 100,get_time_str(*tdiff)
 				, 1000*get_time(*tdiff)/(nmodmul - nmodmul_save), Res64, AME, MME);
 			mlucas_fprint(cbuf,pm1_standlone+scrnFlag);
@@ -2458,8 +2458,8 @@ MME = 0;
 				write_ppm1_savefiles(savefile,p,n,fp, ((uint64)psmall<<56) + q + bigstep, (uint8*)arrtmp,Res64,Res35m1,Res36m1, 0x0,0x0,0x0,0x0);
 				fclose(fp);	fp = 0x0;
 			} else {
-				snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open restart file %s for write of checkpoint data.\n",savefile);
-				mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(HERE, 0,cbuf);
+				snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open restart file %s for write of checkpoint data.\n",savefile);
+				mlucas_fprint(cbuf,pm1_standlone+1);	ASSERT(0,cbuf);
 			}
 			// If interim-GCDs enabled (default) and latest S2 interval crossed a 10M mark, take a GCD; if factor found, early-return;
 			if(interim_gcd) {
@@ -2481,7 +2481,7 @@ MME = 0;
 		}
 	#endif	// #ifndef PM1_STANDALONE
 	}	// endfor(q = qlo; q < qhi; q += bigstep)
-	ASSERT(HERE, nerr == 0, "Stage 2 loop hit a modmul error!");
+	ASSERT(nerr == 0, "Stage 2 loop hit a modmul error!");
 #ifndef PM1_STANDALONE
 	// Need to undo pass 1 of fwd-FFT on loop-exit; do this just as with fwd-FFT-only, but with flag = 8 instead of 4:
 	ierr = func_mod_square(pow, 0x0, n, 0,1, 8ull, p, scrnFlag,&tdif2, FALSE, 0x0);
@@ -2491,40 +2491,40 @@ MME = 0;
 #endif
 	// (k - k0) = #bigstep-blocks (passes thru above loop) used in stage 2; np + ns + 2*(k - k0) = #modmul:
 	nmodmul = np + ns + 2*(k - k0);	// This is actually redundant, but just to spell it out
-	snprintf(cbuf,STR_MAX_LEN,"M = %2u: #buf = %4u, #pairs: %u, #single: %u (%5.2f%% paired), #blocks: %u, #modmul: %u\n",m,m*num_b,np,ns,100.0*2*np/(2*np+ns),k-k0,nmodmul);
+	snprintf(cbuf,STR_MAX_LEN*2,"M = %2u: #buf = %4u, #pairs: %u, #single: %u (%5.2f%% paired), #blocks: %u, #modmul: %u\n",m,m*num_b,np,ns,100.0*2*np/(2*np+ns),k-k0,nmodmul);
 	mlucas_fprint(cbuf,pm1_standlone+1);
 #ifndef PM1_STANDALONE
 
   #ifdef PM1_DEBUG
   #warning Revert this preprocessor flag!
-	fprintf(stderr,"Res64 = 0x%016llX; clocks =%s, MaxErr = %10.9f\n",arrtmp[0],get_time_str(*tdiff),MME);
+	fprintf(stderr,"Res64 = 0x%016" PRIX64 "; clocks =%s, MaxErr = %10.9f\n",arrtmp[0],get_time_str(*tdiff),MME);	// Literal "0x" keeps the pre-patch output; "%#016" would emit "0X", count the prefix inside the 16-wide field, and drop it for a zero value
   if(p == 33554432) {  // F25: check if the known factor divides the S2 result:
-	ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!");
+	ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!");
 	int isfact = mi64_is_div_by_scalar64(arrtmp,2170072644496392193ull,nlimb);	// k = 2^5.3^2.37.997.11066599
-	ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!");
+	ASSERT(isfact != 0, "Failed to find known stage 2 factor!");
 	fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING);
   }
   if(p == 108268067) {
-	ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!");
+	ASSERT(MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!");
 	uint64 rem[2] = {0ull,0ull}, q[2] = {11943519037290122063ull,18561975ull};	// k = 7.17.19.61.294313.38955941; q = k.2^p + 1
 	int isfact = mi64_div(arrtmp,q, nlimb,2, 0x0, rem);
-	ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!");
+	ASSERT(isfact != 0, "Failed to find known stage 2 factor!");
 	fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING);
   }
   if(p == 2147483648) {  // F31: check if the known factor divides the S2 result:
-	ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!");
+	ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!");
 	uint64 rem[2] = {0ull,0ull}, q[2] = {3118754346955702273ull,2544ull};	// k = 3.13.140091319777; q = k.2^(m+2) + 1
 	int isfact = mi64_div(arrtmp,q, nlimb,2, 0x0, rem);
-	ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!");
+	ASSERT(isfact != 0, "Failed to find known stage 2 factor!");
 	fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING);
   }
   #endif	// PM1_DEBUG
 
 	// In case of normal (non-early) return, caller will handle the GCD:
 	if(strlen(gcd_str)) {
-		snprintf_nowarn(cbuf,STR_MAX_LEN, "Stage 2 early-return due to factor found; MaxErr = %10.9f.\n",MME);
+		snprintf(cbuf,STR_MAX_LEN*2, "Stage 2 early-return due to factor found; MaxErr = %10.9f.\n",MME);
 	} else {
-		snprintf_nowarn(cbuf,STR_MAX_LEN, "Stage 2 done; MaxErr = %10.9f. Taking GCD...\n",MME);
+		snprintf(cbuf,STR_MAX_LEN*2, "Stage 2 done; MaxErr = %10.9f. Taking GCD...\n",MME);
 	}
 	mlucas_fprint(cbuf,pm1_standlone+scrnFlag);
 #endif
@@ -2572,7 +2572,7 @@ MME = 0;
 
 	//	while(tpool->tasks_queue.num_tasks != 0) {	//*** not safe, since can have #tasks == 0 with some tasks still in flight ***
 		while(tpool->free_tasks_queue.num_tasks != NTHREADS) {
-			ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!");
+			ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!");
 		//	printf("sleep; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 		}
 	//	printf("end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
diff --git a/src/qfcheb.h b/src/qfcheb.h
index 6a8a23be..78a554ca 100755
--- a/src/qfcheb.h
+++ b/src/qfcheb.h
@@ -1,2 +1,2 @@
 #define STR_MAX_LEN 1024
-extern char cbuf[STR_MAX_LEN];
+extern char cbuf[STR_MAX_LEN*2];
diff --git a/src/qfloat.c b/src/qfloat.c
index ca243c76..bc171f2f 100755
--- a/src/qfloat.c
+++ b/src/qfloat.c
@@ -308,7 +308,7 @@ uint32 qfcmplt(struct qfloat q1, struct qfloat q2)
 	case(3) :	/* Both q1 and q2 negative, in which case a more-negative q1 looks larger w.r.to the unsigned compare */
 		return (q1.hi > q2.hi || (q1.hi == q2.hi && q1.lo > q2.lo));
 	default:
-		ASSERT(HERE, 0,"ERROR 98 in qfloat.c");
+		ASSERT(0,"ERROR 98 in qfloat.c");
 	}
 	return 0;	/* Just to get the compiler to shut up ... this should never be reached. */
 }
@@ -398,9 +398,9 @@ long double qfldbl(struct qfloat x)
 	long double ld;
 	uint64 *ld_ptr = (uint64 *)&ld, nonhidden;
 	int32 exp = (int32)((x.hi & ~MASK_SIGN)>>52);
-	ASSERT(HERE, sizeof(long double) == 16, "QFLDBL assumes 16-byte long double type!");
+	ASSERT(sizeof(long double) == 16, "QFLDBL assumes 16-byte long double type!");
 	// Denormal check:
-	ASSERT(HERE, (exp != 0) && (exp != 0x7ff), "QFLDBL requires normal input!");
+	ASSERT((exp != 0) && (exp != 0x7ff), "QFLDBL requires normal input!");
 	exp -= (int32)0x400;	// x87 80-bit reg-format has 4 more bits in exp, centered around 0x4000 rather than 0x400
 	nonhidden = ((x.hi & MASK_MANT)<<11) + (x.lo>>53) + ((x.lo>>52)&0x1);
 	// Rounding of the off-shifted portion may cause nonhidden-bit summation to overflow into sign bit:
@@ -420,12 +420,12 @@ struct qfloat ldbl_to_q(long double x)
 	long double ld = x;
 	uint64 *ld_ptr = (uint64 *)&ld, x87_mant, x87_sexp;	// Note high 48 bits of x87_sexp are uninited
 	int32 exp;
-	DBG_ASSERT(HERE, sizeof(long double) == 16, "LDBL_TO_Q assumes 16-byte long double type!");
+	DBG_ASSERT(sizeof(long double) == 16, "LDBL_TO_Q assumes 16-byte long double type!");
 	x87_mant = *ld_ptr; x87_sexp = *(ld_ptr+1);
 	if(!x87_mant) return QZRO;
 	// Denormal check:
 	exp = (int32)(((x87_sexp<<48) & ~MASK_SIGN)>>48) - (int32)0x4000;
-	ASSERT(HERE, ABS(exp) <= 0x3ff, "LDBL_TO_Q requires double-compatible normal input!");
+	ASSERT(ABS(exp) <= 0x3ff, "LDBL_TO_Q requires double-compatible normal input!");
 	q.hi = ((x87_sexp<<48) & MASK_SIGN) + ((uint64)((int32)0x400 + exp)<<52) + ((x87_mant>>11) & MASK_MANT);
 	q.lo = (x87_mant<<53);
 	return q;
@@ -508,7 +508,7 @@ struct qfloat i128_to_q(uint128 i)
 		q.hi = sexp + (i.d1 >> rshift);
 		offword = (i.d0 << lshift) >> 63;	// MSB of off-shifted portion
 		q.lo = (i.d1 << lshift) + (i.d0 >> rshift) + offword;
-		ASSERT(HERE, q.lo >= offword, "Ripple-carry!");	// For now, just check for ripple-carry. Proper handling will come later.
+		ASSERT(q.lo >= offword, "Ripple-carry!");	// For now, just check for ripple-carry. Proper handling will come later.
 	}
 	else	/* need to left-shift mantissa */
 	{
@@ -547,7 +547,7 @@ struct qfloat qfmul_pow2(struct qfloat q, int32 pow)
 			qt.hi += sgn;		// Restore sign
 		}
 	} else if(exp1 >> 11) {	// Overflow: exp+pow carried into sign-bit slpt:
-		ASSERT(HERE, 0,"OVERFLOW in qfmul_pow2");
+		ASSERT(0,"OVERFLOW in qfmul_pow2");
 	} else {	// Result is normal
 		if(exp0) {
 			// If normal input, update exponent field and return:
@@ -560,11 +560,11 @@ struct qfloat qfmul_pow2(struct qfloat q, int32 pow)
 				lz = QLEADZ(qt) - 11;	// Number of leading zero bits in denormalized mantissa (i.e. shift count needed to move leading bit into hidden-bit location)
 				if(pow > lz) {	// Result will be normal
 					QLSHIFT(qt, lz, qt);
-					ASSERT(HERE, (qt.hi>>52) == 1, "Bad mantissa left-shift count in qfmul_pow2!");
+					ASSERT((qt.hi>>52) == 1, "Bad mantissa left-shift count in qfmul_pow2!");
 					qt.hi += (((uint64)pow-lz)<<52) - TWOE52;	// Don't fold -TWOE52 in via (pow-lz-1)<<52, since may have pow = lz here.
 				} else {	// Result still denormal
 					QLSHIFT(qt, pow, qt);
-					ASSERT(HERE, (qt.hi>>52) == 0, "Bad mantissa left-shift count in qfmul_pow2!");
+					ASSERT((qt.hi>>52) == 0, "Bad mantissa left-shift count in qfmul_pow2!");
 				}
 			}
 			qt.hi += sgn;		// Restore sign
@@ -584,7 +584,7 @@ uint128 qfnint(struct qfloat q)
 	uint64 offword, carry;
 	uint128 i;
 	i.d1 = q.hi; i.d0 = q.lo;
-	ASSERT(HERE, qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!");
+	ASSERT(qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!");
 	/* Separate upper part of the significand from the sign/exponent fields: */
 	sign = (int32)(i.d1 >> 63);
 	exp  = (int32)(i.d1 >> 52) & MASK_EXP;
@@ -667,7 +667,7 @@ uint128 qfint(struct qfloat q)
 	int32 exp, sign, rshift, lshift;
 	uint128 i;
 	i.d1 = q.hi; i.d0 = q.lo;
-	ASSERT(HERE, qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!");
+	ASSERT(qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFINT input out of range!");	// Report this function's own name, not qfnint's
 
 	/* Separate upper part of the significand from the sign/exponent fields: */
 	sign = (int32)(i.d1 >> 63);
@@ -692,7 +692,7 @@ uint128 qfint(struct qfloat q)
 		{
 			if(rshift == -11 && (!sign || (i.d1 << -rshift) != MASK_SIGN || i.d0 != (uint64)0))
 			{
-				ASSERT(HERE, 0,"ERROR: qfloat is too large to convert to 128-bit integer.");
+				ASSERT(0,"ERROR: qfloat is too large to convert to 128-bit integer.");
 			}
 		}
 		lshift =     - rshift;
@@ -859,7 +859,7 @@ char* qf2str(struct qfloat q)
 			--pow10;
 		}
 	}
-	ASSERT(HERE, mi64_getlen(u,len) == 17 && u[16] && u[16] < 10, "QF2STRING: Normalization error!");
+	ASSERT(mi64_getlen(u,len) == 17 && u[16] && u[16] < 10, "QF2STRING: Normalization error!");
 	os[1] = u[16] + '0';	// Put MSD to left of decimal point
 	os[2] = '.';
 	for(i = 3; i < 38; ++i) {
@@ -981,7 +981,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 	else
 	{
 		printf("Multiply by denormalized operand not supported!");
-	//	ASSERT(HERE, 0,"ERROR in qfloat.c : qfmul");
+	//	ASSERT(0,"ERROR in qfloat.c : qfmul");
 		return QZRO;
 	}
 
@@ -992,7 +992,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 	else
 	{
 		printf("Multiply by denormalized operand not supported!");
-	//	ASSERT(HERE, 0,"ERROR in qfloat.c : qfmul");
+	//	ASSERT(0,"ERROR in qfloat.c : qfmul");
 		return QZRO;
 	}
 
@@ -1005,7 +1005,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 	if( ((sexp1 & ~MASK_SIGN) + (sexp2 & ~MASK_SIGN)) < 0x4000000000000000ull) {
 	#if QFDEBUG
 		WARN(HERE, "DENORM result in QFMUL ... flushing to 0.\n", "", 0);
-		ASSERT(HERE, fabs(qfdbl(q1)*qfdbl(q2)) < 1e-300, "Incorrect DENORM result in QFMUL!");
+		ASSERT(fabs(qfdbl(q1)*qfdbl(q2)) < 1e-300, "Incorrect DENORM result in QFMUL!");
 	#endif
 		return QZRO;
 	}
@@ -1046,8 +1046,8 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 
   /* DEBUG: make sure we didn't lose any bits of b or d during the conversion to float. */
   #if QFDEBUG
-	if(!((uint64)db == b)) ASSERT(HERE, 0,"ERROR 120 in qfloat.c");
-	if(!((uint64)dd == d)) ASSERT(HERE, 0,"ERROR 121 in qfloat.c");
+	if(!((uint64)db == b)) ASSERT(0,"ERROR 120 in qfloat.c");
+	if(!((uint64)dd == d)) ASSERT(0,"ERROR 121 in qfloat.c");
   #endif
 
 	adhi = da*dd;
@@ -1056,7 +1056,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 #endif	// USE_FMUL_FOR_LOW_WORD
 
 	bvac = (uint64)leadz64(hi);
-	ASSERT(HERE, (bvac < 2), "ERROR 130 in qfloat.c");	/* Make sure at most the leftmost bit of high part is vacant. This check
+	ASSERT((bvac < 2), "ERROR 130 in qfloat.c");	/* Make sure at most the leftmost bit of high part is vacant. This check
 						needs to remain in place until support for denormalized oprands is added. */
 	/*
 	Now need to right-shift MUL_LOHI result (12-bvac) places, FMUL results (1-bvac) place, and add together.
@@ -1068,12 +1068,12 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 	/*
 	lo = (hi << 52) + (lo >> 12) + (lo & 1) + (uint64)(adhi + bchi);
  	hi = (hi >> 12);
-	printf("mul_hi = %16llX = %20llu\n", hi, hi);
-	printf("mul_lo = %16llX = %20llu\n", lo, lo);
+	printf("mul_hi = %16" PRIX64 " = %20" PRIu64 "\n", hi, hi);
+	printf("mul_lo = %16" PRIX64 " = %20" PRIu64 "\n", lo, lo);
 	*/
 
 	return_val.hi = (hi >> (11-bvac));
-	ASSERT(HERE, (return_val.hi >> 52) == 1, "ERROR 140 in qfloat.c");
+	ASSERT((return_val.hi >> 52) == 1, "ERROR 140 in qfloat.c");
 	return_val.lo = (hi << (53+bvac)) + (lo >> (11-bvac)) + ((lo >> (10-bvac)) & (uint64)1) + (((uint64)adhi + (uint64)bchi) << bvac);
                                                             /* ^^^^rounding is here^^^^^ */   /* Maximize accuracy by converting to int prior to add. */
 	if(return_val.lo < (hi << (53+bvac)))	/* had a carry out of lo part. */
@@ -1101,7 +1101,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFMUL!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFMUL!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFMUL!\n");
 	}
 #endif
 
@@ -1120,7 +1120,7 @@ struct qfloat qfinc(struct qfloat x)
 		return qfneg(qfdec(q));	// For x < 0, Use x + 1 = 1 - |x| = -(|x| - 1)
 	} else {
 		// Check for denormal (over and underflow):
-		ASSERT(HERE, ((q.hi>>52) + 1) >= 2, "Denormal not supported!");
+		ASSERT(((q.hi>>52) + 1) >= 2, "Denormal not supported!");
 
 		if(q.hi < QONE.hi) {
 		// This is just the significand-add section of qfsum with the following argument value specializations:
@@ -1228,14 +1228,14 @@ struct qfloat qfinc(struct qfloat x)
 		}
 	}
 #if QFDEBUG
-	ASSERT(HERE, qfcmpeq(q, qfadd(x,QONE)), "qfinc fails!");
+	ASSERT(qfcmpeq(q, qfadd(x,QONE)), "qfinc fails!");
 #endif
 	return q;
 }
 
 struct qfloat qfdec(struct qfloat q)
 {
-	ASSERT(HERE, 0, "qfdec not supported yet!");
+	ASSERT(0, "qfdec not supported yet!");
 	return qfsub(q, QONE);
 }
 
@@ -1272,7 +1272,7 @@ struct qfloat qfadd	(struct qfloat q1, struct qfloat q2)
 	}
 	else
 	{
-		ASSERT(HERE, 0,"ERROR: unrecognized sign combination in QFADD");
+		ASSERT(0,"ERROR: unrecognized sign combination in QFADD");
 	}
 #if QFDEBUG
 	double qres = qfdbl(q), dres = (1-2.0*sgn1)*qfdbl(q1) + (1-2.0*sgn2)*qfdbl(q2);	// Must cast sgn1,2 to double prior to 1-...
@@ -1282,7 +1282,7 @@ struct qfloat qfadd	(struct qfloat q1, struct qfloat q2)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFADD!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFADD!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFADD!\n");
 	}
 #endif
 
@@ -1316,7 +1316,7 @@ struct qfloat qfsub	(struct qfloat q1, struct qfloat q2)
 	}
 	else
 	{
-		ASSERT(HERE, 0,"ERROR: unrecognized sign combination in QFSUB");
+		ASSERT(0,"ERROR: unrecognized sign combination in QFSUB");
 	}
 #if QFDEBUG
 	double qres = qfdbl(q), dres = (1-2.0*sgn1)*qfdbl(q1) - (1-2.0*sgn2)*qfdbl(q2);	// Must cast sgn1,2 to double prior to 1-...
@@ -1326,7 +1326,7 @@ struct qfloat qfsub	(struct qfloat q1, struct qfloat q2)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFSUB!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFSUB!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFSUB!\n");
 	}
 #endif
 
@@ -1368,7 +1368,7 @@ struct qfloat qfsum(struct qfloat q1, struct qfloat q2)
 	uint64 exp0, exp1, hi0, hi1, lo0, lo1, offword;
 
 	/* Make sure both inputs are nonnegative. */
-	DBG_ASSERT(HERE, ((int64)q1.hi >= 0 && (int64)q2.hi >= 0),"ERROR 160 in qfloat.c");
+	DBG_ASSERT(((int64)q1.hi >= 0 && (int64)q2.hi >= 0),"ERROR 160 in qfloat.c");
 
 	/* Larger of the two operands gets index 0 in our local length-2 arrays: */
 	if(qfcmple(q2, q1))
@@ -1481,7 +1481,7 @@ struct qfloat qfsum(struct qfloat q1, struct qfloat q2)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFSUM!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFSUM!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFSUM!\n");
 	}
 #endif
 	return(return_val);
@@ -1510,7 +1510,7 @@ struct qfloat qfdif(struct qfloat q1, struct qfloat q2)
 	uint64 exp0, exp1, hi0, hi1, lo0, lo1, offword;
 
 	/* Make sure both inputs are nonnegative. */
-	DBG_ASSERT(HERE, ((int64)q1.hi >= 0) && ((int64)q2.hi >= 0),"ERROR 170 in qfloat.c");
+	DBG_ASSERT(((int64)q1.hi >= 0) && ((int64)q2.hi >= 0),"ERROR 170 in qfloat.c");
 
 	/* Larger of the two operands gets index 0 in our local length-2 arrays: */
 	if(qfcmple(q2, q1))
@@ -1635,7 +1635,7 @@ struct qfloat qfdif(struct qfloat q1, struct qfloat q2)
 		else	/* Hi part is zero. Assuming lo part has lzlo lead zeros, right-shift lo (11-lzlo) bits and put that into hi. */
 		{
 		#if QFDEBUG
-			printf("WARNING: catastrophic loss of precision in subtract:\n %16llX %16llX -\n %16llX %16llX\n", ptr0->hi, ptr0->lo, ptr1->hi, ptr1->lo);
+			printf("WARNING: catastrophic loss of precision in subtract:\n %16" PRIX64 " %16" PRIX64 " -\n %16" PRIX64 " %16" PRIX64 "\n", ptr0->hi, ptr0->lo, ptr1->hi, ptr1->lo);
 		#endif
 		//	return QZRO; *** Taking the easy way out breaks older already-tested stuff in qtest() ***
 			if((int32)rshift > 0)	/* Lo part has > 53 SB, upper 53 of which get put into hi part. */
@@ -1673,7 +1673,7 @@ struct qfloat qfdif(struct qfloat q1, struct qfloat q2)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFDIF!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFDIF!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFDIF!\n");
 	}
 #endif
 
@@ -1709,7 +1709,7 @@ struct qfloat qfinv(struct qfloat x)
 	/* Make sure x is properly normalized. This also catches potential divides-by-zero. */
 	if((x.hi & ~(MASK_SIGN + MASK_MANT)) == (uint64)0)
 	{
-		ASSERT(HERE, 0,"ERROR: divide by denormalized input not supported.");
+		ASSERT(0,"ERROR: divide by denormalized input not supported.");
 	}
 #ifdef X87_ASM
 	ld = qfldbl(x);
@@ -1750,7 +1750,7 @@ struct qfloat qfinv(struct qfloat x)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFINV!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFINV!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFINV!\n");
 	}
 #endif
 
@@ -1773,7 +1773,7 @@ struct qfloat qfdiv(struct qfloat q1, struct qfloat q2)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFDIV!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFDIV!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFDIV!\n");
 	}
 #endif
 	return qinv;
@@ -1814,7 +1814,7 @@ struct qfloat qfsqrt(struct qfloat x)
 #endif
 
 	/* Make sure x is nonnegative. This also catches potential divides-by-zero. */
-	ASSERT(HERE, !(x.hi >> 63),"ERROR: sqrt of a negative number not supported.");
+	ASSERT(!(x.hi >> 63),"ERROR: sqrt of a negative number not supported.");
 	if(qfcmpeq(x, QZRO)) return QZRO;
 #ifdef X87_ASM
 	ld = qfldbl(x);
@@ -1855,7 +1855,7 @@ struct qfloat qfsqrt(struct qfloat x)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFSQRT!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFSQRT!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFSQRT!\n");
 	}
 #endif
 	/* Multiply 1/sqrt(x) by x to get sqrt(x). */
@@ -1871,7 +1871,7 @@ struct qfloat qisqrt(struct qfloat q)
 	/* Make sure q is nonnegative. This also catches potential divides-by-zero. */
 	if(q.hi >> 63)
 	{
-		ASSERT(HERE, 0,"ERROR: sqrt of a negative number not supported.");
+		ASSERT(0,"ERROR: sqrt of a negative number not supported.");
 	}
 	else if(qfcmpeq(q, QZRO))
 	{
@@ -1933,7 +1933,7 @@ struct qfloat qfagm(struct qfloat x, struct qfloat y)
 			if(idiff < 8) break;
 		}
 	}
-	ASSERT(HERE, (i < 20), "Failure to converge in QFAGM!");
+	ASSERT((i < 20), "Failure to converge in QFAGM!");
 	return a;
 }
 
@@ -2014,14 +2014,14 @@ struct qfloat qflog(struct qfloat x)
 #endif
 	struct qfloat expy;
 
-	ASSERT(HERE, qfcmplt(QZRO,x), "Arg must be > 0 in QFLOG!");
+	ASSERT(qfcmplt(QZRO,x), "Arg must be > 0 in QFLOG!");
 
 #if 0	// Algo 0
 
 	uint32 efield,k;
 	// Find smallest k s.t. y = (2^k)*x >= 2^64:
 	/* Extract 11-bit exponent field and add sign-extended power-of-2 exponent: */
-	efield = ((x.hi >> 52) & MASK_EXP);	ASSERT(HERE, efield,"Denormalized numbers not currently supported in QFLOG");
+	efield = ((x.hi >> 52) & MASK_EXP);	ASSERT(efield,"Denormalized numbers not currently supported in QFLOG");
 	// 1.0 has efield = 0x3FF, so use that to compute k:
 	k = 64 - (efield - 0x3FF);
 	y = qfmul_pow2(x, (uint64)k);
@@ -2077,7 +2077,7 @@ struct qfloat qflog(struct qfloat x)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFLOG!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFLOG!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFLOG!\n");
 	}
 #endif
 	return y;
@@ -2142,8 +2142,8 @@ struct qfloat qfexp(struct qfloat x)
 	pow2 = (xabs.hi >> 52) - 0x3fd;
 	if(abs(pow2) > 9) {	// If arg > +- 512, check for over/underflow which occurs for |arg| ~> 700
 		if(darg > 700) {
-			fprintf(stderr,"QFEXP: xabs.hi = %16llX, pow2 = %u, darg = %10.10e\n",xabs.hi,pow2,darg);
-			ASSERT(HERE,0,"expo overflow!");
+			fprintf(stderr,"QFEXP: xabs.hi = %16" PRIX64 ", pow2 = %u, darg = %10.10e\n",xabs.hi,pow2,darg);
+			ASSERT(0,"expo overflow!");
 		} else if(darg < -700) {
 			return QZRO;	// expo underflow flushes to zero
 		}
@@ -2206,7 +2206,7 @@ struct qfloat qfexp(struct qfloat x)
 	} else {
 		nterm_idx = i - 0x3C5;
 	//	printf("Input exp-field = %3X, nterm_idx = %d; \n",i,nterm_idx);
-		ASSERT(HERE, nterm_idx < 64, "nterm_idx ou of range!");
+		ASSERT(nterm_idx < 64, "nterm_idx out of range!");	// Guards the nterm_arr[] lookup on the next line
 		nterm = nterm_arr[nterm_idx];
 	}
 
@@ -2262,7 +2262,7 @@ struct qfloat qfexp(struct qfloat x)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFEXP!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFEXP!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFEXP!\n");
 	}
 #endif
 	return y;
@@ -2318,7 +2318,7 @@ struct qfloat qfatan(struct qfloat x)
 		if( rerr > 1e-12 ) {
 			WARN(HERE, "High Error Level in QFATAN!\n", "", 0);
 		}
-		ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFATAN!\n");
+		ASSERT(rerr < 1e-2, "Fatal ROE in QFATAN!\n");
 	}
 #endif
 	return y;
@@ -2458,7 +2458,7 @@ struct qfloat qfcos	(struct qfloat q)
 	i  = qfint(qt);				// ...And truncate that to the next-smaller integer (128-bit int in this case).
 	// For the quadrant, we only need the result modulo 4:
 	quad = i.d0 & (uint64)3;
-	ASSERT(HERE, !i.d1 && (int64)i.d0 >= 0,"QFCOS: quadrant error");
+	ASSERT(!i.d1 && (int64)i.d0 >= 0,"QFCOS: quadrant error");
 	qt = i64_to_q((int64)i.d0);
 	// The calling argument is q mod pi/2:
 	q = qfsub(q, qfmul(qt, QPIHALF));
@@ -2483,7 +2483,7 @@ struct qfloat qfsin	(struct qfloat q)
 	i  = qfint(qt);				// ...And truncate that to the next-smaller integer (128-bit int in this case).
 	// For the quadrant, we only need the result modulo 4:
 	quad = i.d0 & (uint64)3;
-	ASSERT(HERE, !i.d1 && (int64)i.d0 >= 0,"QFSIN: quadrant error");
+	ASSERT(!i.d1 && (int64)i.d0 >= 0,"QFSIN: quadrant error");
 	qt = i64_to_q((int64)i.d0);
 	// The calling argument is q mod pi/2:
 	q = qfsub(q, qfmul(qt, QPIHALF));
@@ -2563,7 +2563,7 @@ struct qfloat qfcs1(struct qfloat q)
 	static int first_entry = TRUE;
 	if(first_entry) {
 		first_entry = FALSE;
-		denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(HERE, denoms != NULL, "alloc failed!");
+		denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(denoms != NULL, "alloc failed!");
 		for(i = 4; i < 38; i += 4)	// Must limit largest index into QNINV[] to 40, hence (i+2) < 40
 		{
 			j = (i>>1)-1;
@@ -2573,7 +2573,7 @@ struct qfloat qfcs1(struct qfloat q)
 	}
 #endif
 	/* Make sure argument is in range... */
-	DBG_ASSERT(HERE, (qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 200 in qfloat.c");
+	DBG_ASSERT((qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 200 in qfloat.c");
 
 #if (USE_CHEB_EXPANSION == 1)	// Branchless algorithm:
 
@@ -2621,7 +2621,7 @@ struct qfloat qfcs1(struct qfloat q)
 			break;
 		}
 	}
-	ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged cos(x) summation!");
+	ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged cos(x) summation!");
 
   #elif (USE_CHEB_EXPANSION == 2)
 
@@ -2686,7 +2686,7 @@ struct qfloat qfsn1(struct qfloat q)
 	static int first_entry = TRUE;
 	if(first_entry) {
 		first_entry = FALSE;
-		denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(HERE, denoms != NULL, "alloc failed!");
+		denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(denoms != NULL, "alloc failed!");
 		for(i = 3; i < 38; i += 4)	// Must limit largest index into QNINV[] to 40, hence (i+2) < 40
 		{
 			j = (i>>1);
@@ -2696,7 +2696,7 @@ struct qfloat qfsn1(struct qfloat q)
 	}
 #endif
 	/* Make sure argument is in range... */
-	DBG_ASSERT(HERE, (qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 210 in qfloat.c");
+	DBG_ASSERT((qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 210 in qfloat.c");
 
 #if (USE_CHEB_EXPANSION == 1)	// Branchless algorithm:
 
@@ -2743,7 +2743,7 @@ struct qfloat qfsn1(struct qfloat q)
 			break;
 		}
 	}
-	ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged sin(x) summation!");
+	ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged sin(x) summation!");
 
   #elif (USE_CHEB_EXPANSION == 2)
 
@@ -2835,7 +2835,7 @@ struct qfloat qfcosh(struct qfloat q)
 		static int first_entry = TRUE;
 		if(first_entry) {
 			first_entry = FALSE;
-			denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(HERE, denoms != NULL, "alloc failed!");
+			denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(denoms != NULL, "alloc failed!");
 			for(i = 4; i < 38; i += 4)	// Limit largest index into QNINV[] to 40, hence (i+2) < 40 (--> i_max = 36 here)
 			{
 				j = (i>>1)-1;
@@ -2883,7 +2883,7 @@ struct qfloat qfcosh(struct qfloat q)
 			curr_term = qfmul(curr_term, mult);
 			e_sum = (uint32)((sum      .hi & ~MASK_SIGN) >> 52);
 			e_new = (uint32)((curr_term.hi & ~MASK_SIGN) >> 52);
-			ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged cosh(x) summation!");
+			ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged cosh(x) summation!");
 		}
 		return sum;
 	}	// |x| >= 2 ?
@@ -2904,7 +2904,7 @@ struct qfloat qfsinh(struct qfloat q)
 		static int first_entry = TRUE;
 		if(first_entry) {
 			first_entry = FALSE;
-			denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(HERE, denoms != NULL, "alloc failed!");
+			denoms = (struct qfloat *)malloc(20*qsz);	ASSERT(denoms != NULL, "alloc failed!");
 			for(i = 3; i < 38; i += 4)	// Must limit largest index into QNINV[] to 40, hence (i+2) < 40
 			{
 				j = (i>>1);
@@ -2951,7 +2951,7 @@ struct qfloat qfsinh(struct qfloat q)
 			curr_term = qfmul(curr_term, mult);
 			e_sum = (uint32)((sum      .hi & ~MASK_SIGN) >> 52);
 			e_new = (uint32)((curr_term.hi & ~MASK_SIGN) >> 52);
-			ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged sinh(x) summation!");
+			ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged sinh(x) summation!");
 		}
 		return sum;
 	}	// |x| >= 2 ?
@@ -3013,20 +3013,20 @@ int qtest(void)
 	uint64 *ld_ptr = &ld, x87_mant, x87_sexp;
 
 	// Test I/O functions:
-	ASSERT(HERE, STREQ( qf2str(QPI), "+3.14159265358979323846264338327950289 E+000" ), "I/O test failed!");
+	ASSERT(STREQ( qf2str(QPI), "+3.14159265358979323846264338327950289 E+000" ), "I/O test failed!");
 
 	asm ("fldln2;"
 		 "fstpt %0" : "=m"(ld) : );
 	x87_mant = *ld_ptr; x87_sexp = *(ld_ptr+1) & 0x000000000000FFFFull;	// Mask off high 48 bits of x87_sexp field, as these are uninited
 	if(x87_mant != 0xB17217F7D1CF79ACull) {
 		printf("ln2 = %30.20Le\n", ld);
-		printf("x87_mant = %16llx, expected 0xB17217F7D1CF79ACull\n", x87_mant);	// x87_mant = b17217f7d1cf79ac, left-shift one place to off-shift hidden bit
+		printf("x87_mant = %16" PRIx64 ", expected 0xB17217F7D1CF79ACull\n", x87_mant);	// x87_mant = b17217f7d1cf79ac, left-shift one place to off-shift hidden bit
 		WARN(HERE, "Ln2 long-double mantissa conversion error", "", 0);
 	}
-//	ASSERT(HERE, x87_mant == 0xB17217F7D1CF79ACull, "Ln2 long-double mantissa conversion error");
+//	ASSERT(x87_mant == 0xB17217F7D1CF79ACull, "Ln2 long-double mantissa conversion error");
 
-//	printf("x87_sexp = %16llx\n", x87_sexp);	// x87_sexp = 3ffe, clear high 4 bits to get qfloat/double-compatible exp-field
-	ASSERT(HERE, x87_sexp == 0x0000000000003FFEull, "Ln2 long-double exponent conversion error");
+//	printf("x87_sexp = %16" PRIx64 "\n", x87_sexp);	// x87_sexp = 3ffe, clear high 4 bits to get qfloat/double-compatible exp-field
+	ASSERT(x87_sexp == 0x0000000000003FFEull, "Ln2 long-double exponent conversion error");
 
 	asm ("fld1;"
 		 "fadd %%st(0), %%st(0);"
@@ -3037,17 +3037,17 @@ int qtest(void)
 
 	if(x87_mant != 0xB504F333F9DE6484ull) {
 		printf("-Sqrt2 = %30.20Le\n", ld);
-		printf("x87_mant = %16llx, expected 0xB504F333F9DE6484ull\n", x87_mant);
+		printf("x87_mant = %16" PRIx64 ", expected 0xB504F333F9DE6484ull\n", x87_mant);
 		WARN(HERE, "-Sqrt2 long-double mantissa conversion error", "", 0);
 	}
-//	ASSERT(HERE, x87_mant == 0xB504F333F9DE6484ull, "-Sqrt2 long-double mantissa conversion error");
+//	ASSERT(x87_mant == 0xB504F333F9DE6484ull, "-Sqrt2 long-double mantissa conversion error");
 
-//	printf("x87_sexp = %16llx\n", x87_sexp);
-	ASSERT(HERE, x87_sexp == 0x000000000000BFFFull, "-Sqrt2 long-double exponent conversion error");
+//	printf("x87_sexp = %16" PRIx64 "\n", x87_sexp);
+	ASSERT(x87_sexp == 0x000000000000BFFFull, "-Sqrt2 long-double exponent conversion error");
 
 #endif
 
-	ASSERT(HERE, (ABS((int64)0x1234567890ABCDEFull) == 0x1234567890ABCDEFull), "ERROR 10 in qfloat.c");
+	ASSERT((ABS((int64)0x1234567890ABCDEFull) == 0x1234567890ABCDEFull), "ERROR 10 in qfloat.c");
 
 	/*********** TEST THE TYPE CONVERSIONS **************/
 #if TIMING_TEST
@@ -3060,7 +3060,7 @@ int qtest(void)
 		td += qfdbl(QEXP);
 	}
 	clock2 = clock();
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
 	cycles /= 4.0*(double)titers;
@@ -3069,52 +3069,52 @@ int qtest(void)
 #endif
 	c = 0.0;	d = qfdbl(QZRO);
 #if QFDEBUG
-		printf("dble(0.0) = %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(0.0) = %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 12 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 12 in qfloat.c");
 
 	c = 1.0;	d = qfdbl(QONE);
 #if QFDEBUG
-		printf("dble(1.0) = %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(1.0) = %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 14 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 14 in qfloat.c");
 
 	c = 2.0;	d = qfdbl(QTWO);
 #if QFDEBUG
-		printf("dble(2.0) = %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(2.0) = %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 16 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 16 in qfloat.c");
 
 	c =-2.0;	d = qfdbl(qfneg(QTWO));
 #if QFDEBUG
-		printf("dble(-2.0)= %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(-2.0)= %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 18 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 18 in qfloat.c");
 
 	c = 2*pi;	d = qfdbl(Q2PI);
 #if QFDEBUG
-		printf("dble(2pi) = %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(2pi) = %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 20 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 20 in qfloat.c");
 
 	c =log(2.0);d = qfdbl(QLN2);
 #if QFDEBUG
-		printf("dble(ln2) = %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(ln2) = %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 22 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 22 in qfloat.c");
 
 	c = exp(1.0);
 	d = qfdbl(QEXP);
 #if QFDEBUG
-		printf("dble(exp) = %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(exp) = %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 24 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 24 in qfloat.c");
 
 	c = -c;		d = qfdbl(qfneg(QEXP));
 #if QFDEBUG
-		printf("dble(-exp)= %16llX  %16llX\n",*(int64 *)&c, *(int64 *)&d);
+		printf("dble(-exp)= %16" PRIX64 "  %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d);
 #endif
-	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 26 in qfloat.c");
+	hidiff = *(int64 *)&c - *(int64 *)&d;	if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 26 in qfloat.c");
 
 	/*********** TEST THE MULTIPLY ALGORITHM ************/
 #if TIMING_TEST
@@ -3127,7 +3127,7 @@ int qtest(void)
 		hidiff += qfmul_pow2(QLN2,+1).hi;
 		hidiff += qfmul_pow2(QLN2,+1).hi;
 	}
-	ASSERT(HERE, hidiff, "!");
+	ASSERT(hidiff, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3140,7 +3140,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfmul(QEXP,QEXP));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3150,65 +3150,65 @@ int qtest(void)
 	/* e*e: 	0x401D8E64B8D4DDAD, 0xCC33A3BA206B68AC	*/
 	q = qfmul(QEXP,QEXP);
 #if QFDEBUG
-		printf("      e*e  = %16llX  %16llX\n",q.hi,q.lo);
+		printf("      e*e  = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 		printf("dble( e*e) = %25.16e\n",qfdbl(q));
 #endif
 	qref.hi = 0x401D8E64B8D4DDADull;	qref.lo = 0xCC33A3BA206B68ACull;
 	// This is better than the separate hi/lo-word test, since it allows for the ROE to be of either sign:
 	qerr = qfabs(qfsub(q,qref));			// Div-by-eps same as mul-by-by-2^118
 	derr = qfdbl( qfmul_pow2(qerr,+118) );	// The threshold here typically needs to be ~16*[magnitude of output]
-	ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!");
 
 	/* ln2*e:	0x3FFE258ECC242F82, 0x5DEC567E6A0E1111	*/
 	q = qfmul(QLN2,QEXP);
 #if QFDEBUG
-		printf("     L2*e  = %16llX  %16llX\n",q.hi,q.lo);
+		printf("     L2*e  = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 		printf("dble(L2*e) = %25.16e\n",qfdbl(q));
 #endif
 	qref.hi = 0x3FFE258ECC242F82ull;	qref.lo = 0x5DEC567E6A0E1111ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!");
 
 	/* ln2^2:	0x3FDEBFBDFF82C58E, 0xA86F16B06EC97360	*/
 	q = qfmul(QLN2,QLN2);
 #if QFDEBUG
-		printf("     L2^2  = %16llX  %16llX\n",q.hi,q.lo);
+		printf("     L2^2  = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 		printf("dble(L2^2) = %25.16e\n",qfdbl(q));
 #endif
 	qref.hi = 0x3FDEBFBDFF82C58Eull;	qref.lo = 0xA86F16B06EC97360ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!");
 
 	/* ln2*2pi:	0x40116BB24190A0B6, 0xE765BE0D06135E60	*/
 	q = qfmul(QLN2,Q2PI);
 #if QFDEBUG
-		printf("     Ln2*pi = %16llX  %16llX\n",q.hi,q.lo);
+		printf("     Ln2*pi = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 		printf("dble(Ln2*pi)= %25.16e\n",qfdbl(q));
 #endif
 	qref.hi = 0x40116BB24190A0B6ull;	qref.lo = 0xE765BE0D06135E60ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!");
 
 	/* 2pi*e:	0x403114580B45D474, 0x9E6108579A2D0CA7	*/
 	q = qfmul(Q2PI,QEXP);
 #if QFDEBUG
-		printf("     pi*e  = %16llX  %16llX\n",q.hi,q.lo);
+		printf("     pi*e  = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 		printf("dble(pi*e) = %25.16e\n",qfdbl(q));
 #endif
 	qref.hi = 0x403114580B45D474ull;	qref.lo = 0x9E6108579A2D0CA7ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 128.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 128.0 ,"ERROR in QFMUL error-level check!");
 
 	/* 2pi*2pi:	0x4043BD3CC9BE45DE, 0x5A4ADC4D9B301183	*/
 	q = qfmul(Q2PI,Q2PI);
 #if QFDEBUG
-		printf("  (2*pi)^2 = %16llX  %16llX\n",q.hi,q.lo);
+		printf("  (2*pi)^2 = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 		printf("dble(2pi^2)= %25.16e\n",qfdbl(q));
 		printf("dble(2pi^2)= %25.16e\n",pi*pi);
 #endif
 	qref.hi = 0x4043BD3CC9BE45DEull;	qref.lo = 0x5A4ADC4D9B301183ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 128.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 128.0 ,"ERROR in QFMUL error-level check!");
 
 	/*********** TEST THE ADDITION ALGORITHM ************/
 #if TIMING_TEST
@@ -3217,7 +3217,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfadd(QEXP,QEXP));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3227,11 +3227,11 @@ int qtest(void)
 	/* 2*pi+e:	0x402200C04CE72C66, 0x7821CB48D9B947AC	*/
 	q = qfadd(QEXP,Q2PI);
 #if QFDEBUG
-		printf("  2*pi + e = %16llX  %16llX\n",q.hi,q.lo);
+		printf("  2*pi + e = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x402200C04CE72C66ull;	qref.lo = 0x7821CB48D9B947ACull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!");
+	ASSERT(derr < 64.0 ,"ERROR in QFADD error-level check!");	// qfadd result vs. precomputed reference
 
 	/********** TEST THE SUBTRACTION ALGORITHM **********/
 #if TIMING_TEST
@@ -3240,7 +3240,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfsub(QEXP,QEXP));
 	}
-	ASSERT(HERE, td == 0.0, "!");
+	ASSERT(td == 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3251,69 +3251,69 @@ int qtest(void)
 	q.hi = 0x3FEFFFFFFFFFFFFFull;	q.lo = 0xFFFFFFFFFFFFFFFFull;
 	q = qfsub(q, q);
 #if QFDEBUG
-		printf("result1 = %16llX  %16llX\n",q.hi,q.lo);
+		printf("result1 = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = qref.lo = 0x0000000000000000ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!");
+	ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!");
 
 	p.hi = 0x3FEFFFFFFFFFFFFFull;	p.lo = 0xFFFFFFFFFFFFFFFFull;
 	q.hi = 0x3FEFFFFFFFFFFFFFull;	q.lo = 0xFFFFFFFFFFFFFFFEull;
 	q = qfsub(p, q);
 #if QFDEBUG
-		printf("result2 = %16llX  %16llX\n",q.hi,q.lo);
+		printf("result2 = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x38A0000000000000ull;	qref.lo = 0x0000000000000000ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!");
+	ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!");
 
 	/* Both inputs   normalized, output denormalized, with just one significant bit. */
 	p.hi = 0x00FFFFFFFFFFFFFFull;	p.lo = 0xFFFFFFFFFFFFFFFFull;
 	q.hi = 0x00FFFFFFFFFFFFFFull;	q.lo = 0xFFFFFFFFFFFFFFFEull;
 	q = qfsub(p, q);
 #if QFDEBUG
-		printf("result3 = %16llX  %16llX\n",q.hi,q.lo);
+		printf("result3 = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x0000000000000000ull;	qref.lo = 0x0000000000004000ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!");
+	ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!");
 
 	/* Both inputs denormalized, output zero */
 	q.hi = 0x000FFFFFFFFFFFFFull;	q.lo = 0xFFFFFFFFFFFFFFFFull;
 	q = qfsub(q, q);
 #if QFDEBUG
-		printf("result4 = %16llX  %16llX\n",q.hi,q.lo);
+		printf("result4 = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = qref.lo = 0ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!");
+	ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!");
 
 	/* Both inputs denormalized, output denormalized, with just one significant bit. */
 	p.hi = 0x000FFFFFFFFFFFFFull;	p.lo = 0xFFFFFFFFFFFFFFFFull;
 	q.hi = 0x000FFFFFFFFFFFFFull;	q.lo = 0xFFFFFFFFFFFFFFFEull;
 	q = qfsub(p, q);
 #if QFDEBUG
-		printf("result5 = %16llX  %16llX\n",q.hi,q.lo);
+		printf("result5 = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0ull;	qref.lo = 1ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!");
+	ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!");
 
 	/* 2*pi-e:	0x400C84EC1D7402C7, 0x39DB360DDEDB4F60	*/
 	q = qfsub(Q2PI,QEXP);
 #if QFDEBUG
-		printf("    2pi- e = %16llX  %16llX\n",q.hi,q.lo);
+		printf("    2pi- e = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x400C84EC1D7402C7ull;	qref.lo = 0x39DB360DDEDB4F60ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!");
+	ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!");
 
 	/* e-2*pi:	0xC00C84EC1D7402C7, 0x39DB360DDEDB4F60	*/
 	r = qfsub(QEXP,Q2PI);
 #if QFDEBUG
-		printf("     e-2pi = %16llX  %16llX\n",r.hi,r.lo);
+		printf("     e-2pi = %16" PRIX64 "  %16" PRIX64 "\n",r.hi,r.lo);
 #endif
-	if(!(qfcmpeq(r, qfneg(q)))) ASSERT(HERE, 0,"ERROR 54 in qfloat.c");
+	if(!(qfcmpeq(r, qfneg(q)))) ASSERT(0,"ERROR 54 in qfloat.c");
 
 	/*********** TEST THE SQUARE ROOT ALGORITHM ************/
 #if TIMING_TEST
@@ -3322,7 +3322,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfsqrt(QEXP));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3332,7 +3332,7 @@ int qtest(void)
 	/* sqrt(2):	0x3FF6A09E667F3BCC, 0x908B2FB1366EA958, qfsqrt gives ...956. */
 	q = qfsqrt(QTWO);
 #if QFDEBUG
-		printf("sqrt(2) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("sqrt(2) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FF6A09E667F3BCCull;	qref.lo = 0x908B2FB1366EA958ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3340,7 +3340,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFSQRT error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFSQRT error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFSQRT error-level check!");
 
 	/*********** TEST THE INVERSION AND DIVISION ALGORITHMS ************/
 #if TIMING_TEST
@@ -3349,7 +3349,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfinv(QEXP));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3359,7 +3359,7 @@ int qtest(void)
 	/* 1/(2*pi):0x3FC45F306DC9C882, 0xA53F84EAFA3EA69B(B81B...), qfinv gives ...698. */
 	q = qfinv(Q2PI);
 #if QFDEBUG
-		printf("1/(2*pi) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("1/(2*pi) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FC45F306DC9C882ull;	qref.lo = 0xA53F84EAFA3EA69Bull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3367,12 +3367,12 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFINV error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFINV error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFINV error-level check!");
 
 	/* 1/e:		0x3FD78B56362CEF37, 0xC6AEB7B1E0A4153E(4376...), qfinv gives ...53C. */
 	q = qfinv(QEXP);
 #if QFDEBUG
-		printf("1/e      = %16llX  %16llX\n",q.hi,q.lo);
+		printf("1/e      = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FD78B56362CEF37ull;	qref.lo = 0xC6AEB7B1E0A4153Eull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3380,12 +3380,12 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFINV error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFINV error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFINV error-level check!");
 
 	/* 1/ln2:	0x3FF71547652B82FE, 0x1777D0FFDA0D23A7(D11D...), qfinv gives ...3A6. */
 	q = qfinv(QLN2);
 #if QFDEBUG
-		printf("1/ln(2)  = %16llX  %16llX\n",q.hi,q.lo);
+		printf("1/ln(2)  = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FF71547652B82FEull;	qref.lo = 0x1777D0FFDA0D23A7ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3393,7 +3393,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFINV error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFINV error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFINV error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3401,7 +3401,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfdiv(QEXP,QPI));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3411,7 +3411,7 @@ int qtest(void)
 	/* 2*pi/ln2:0x40222123045B5DEB, 0x9C5398CE82C06E4B(80DB...), qfdiv gives ...E4A. */
 	q = qfdiv(Q2PI, QLN2);
 #if QFDEBUG
-		printf("2*pi/ln(2)  = %16llX  %16llX\n",q.hi,q.lo);
+		printf("2*pi/ln(2)  = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x40222123045B5DEBull;	qref.lo = 0x9C5398CE82C06E4Bull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3419,7 +3419,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFDIV error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 128.0 ,"ERROR in QFDIV error-level check!");
+//	ASSERT(derr < 128.0 ,"ERROR in QFDIV error-level check!");
 
 	/*********** TEST THE TRANSCENDENTAL FUNCTIONS ************/
 #if TIMING_TEST
@@ -3428,7 +3428,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfsn1(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3443,7 +3443,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFSIN error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFSIN error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFSIN error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3451,7 +3451,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfcs1(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3466,7 +3466,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFCOS error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFCOS error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFCOS error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3474,7 +3474,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qftan(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3484,7 +3484,7 @@ int qtest(void)
 	/* tan(Pi/4):	Compare to 1: */
 	q = qftan(QPI4TH);
 #if QFDEBUG
-		printf("qtfan(PI/4) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qftan(PI/4) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = QONE.hi;	qref.lo = QONE.lo;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3492,7 +3492,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFTAN error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFTAN error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFTAN error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3500,7 +3500,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfcot(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3510,7 +3510,7 @@ int qtest(void)
 	/* cot(Pi/4):	Compare to 1: */
 	q = qfcot(QPI4TH);
 #if QFDEBUG
-		printf("qfcot(PI/4) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qfcot(PI/4) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = QONE.hi;	qref.lo = QONE.lo;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3518,7 +3518,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFCOT error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFCOT error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFCOT error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3526,7 +3526,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfatan(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3536,7 +3536,7 @@ int qtest(void)
 	/* atan(1):	Compare to precomputed Pi/4: */
 	q = qfatan(QONE);
 #if QFDEBUG
-		printf("qatan(1) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qatan(1) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = QPI4TH.hi;	qref.lo = QPI4TH.lo;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3544,7 +3544,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFATAN error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFATAN error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFATAN error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3552,7 +3552,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qflog(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3562,18 +3562,18 @@ int qtest(void)
 	/* log(2):	Compare to precomputed QLN2 = {0x3FE62E42FEFA39EFull, 0x35793C7673007E5Full}: */
 	q = qflog(QTWO);
 #if QFDEBUG
-		printf("qlog(2) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qlog(2) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FE62E42FEFA39EFull;	qref.lo = 0x35793C7673007E5Full;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
-	ASSERT(HERE, derr < 1100.0,"ERROR in QFLOG error-level check!");	// AGM-based log is fast but error-prone
+	ASSERT(derr < 1100.0,"ERROR in QFLOG error-level check!");	// AGM-based log is fast but error-prone
 
 	/* log(2^64):	Compare to precomputed log(2^64) = (same as log(2) but exp-field += 6): */
 	q = qfmul_pow2(QONE,+64);
 	q = qflog(q);
 
 #if QFDEBUG
-		printf("qlog(2^64) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qlog(2^64) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x40462E42FEFA39EFull;	qref.lo = 0x35793C7673007E5Full;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3581,7 +3581,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFLOG error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 1100.0 ,"ERROR in QFLOG error-level check!");
+//	ASSERT(derr < 1100.0 ,"ERROR in QFLOG error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3589,7 +3589,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfexp(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3599,7 +3599,7 @@ int qtest(void)
 	/* exp(1):	0x4005BF0A8B145769, 0x5355FB8AC404E7A7(9E3B...), qfexp gives ...4E7A7, ~116 bits of accuracy. */
 	q = qfexp(QONE);
 #if QFDEBUG
-		printf("qexp(1) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qexp(1) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x4005BF0A8B145769ull;	qref.lo = 0x5355FB8AC404E7A7ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3607,13 +3607,13 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFEXP error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr <= 64.0 ,"ERROR in QFEXP error-level check!");
+//	ASSERT(derr <= 64.0 ,"ERROR in QFEXP error-level check!");
 
 	/* Sine and cosine are somewhat roundoff-error prone, so raise the error limit slightly. */
 	/* cos(1):	0x3FE14A280FB5068B, 0x923848CDB2ED0E37(A534...), qfcs1 gives ...D0E38, ~116 bits of accuracy */
 	q = qfcs1(QONE);
 #if QFDEBUG
-		printf("qcs1(1) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qcs1(1) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FE14A280FB5068Bull;	qref.lo = 0x923848CDB2ED0E37ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3621,18 +3621,18 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFCS1 error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFCS1 error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFCS1 error-level check!");
 
 	r = qfcos(QONE);
 #if QFDEBUG
-		printf("qcos(1) = %16llX  %16llX\n",r.hi,r.lo);
+		printf("qcos(1) = %16" PRIX64 "  %16" PRIX64 "\n",r.hi,r.lo);
 #endif
-	if(!(qfcmpeq(r, q))) ASSERT(HERE, 0,"ERROR 70 in qfloat.c");
+	if(!(qfcmpeq(r, q))) ASSERT(0,"ERROR 70 in qfloat.c");
 
 	/* sin(1):	0x3FEAED548F090CEE, 0x0418DD3D2138A1E7(8651...), qfsn1 gives ...8A1E9, ~115 bits of accuracy */
 	q = qfsn1(QONE);
 #if QFDEBUG
-		printf("qsn1(1) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qsn1(1) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FEAED548F090CEEull;	qref.lo = 0x0418DD3D2138A1E7ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3640,18 +3640,18 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFSN1 error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFSN1 error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFSN1 error-level check!");
 
 	r = qfsin(QONE);
 #if QFDEBUG
-		printf("qsin(1) = %16llX  %16llX\n",r.hi,r.lo);
+		printf("qsin(1) = %16" PRIX64 "  %16" PRIX64 "\n",r.hi,r.lo);
 #endif
-	if(!(qfcmpeq(r, q))) ASSERT(HERE, 0,"ERROR 74 in qfloat.c");
+	if(!(qfcmpeq(r, q))) ASSERT(0,"ERROR 74 in qfloat.c");
 
 	/* cos(100):0x3FEB981DBF665FDF, 0x63F433736617A041(5D8A...), qfcos gives ...7A023, ~114 bits of accuracy */
 	q = qfcos(i64_to_q((int64)100));
 #if QFDEBUG
-		printf("qcos(100) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qcos(100) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0x3FEB981DBF665FDFull;	qref.lo = 0x63F433736617A041ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3659,12 +3659,12 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFCOS error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 128.0 ,"ERROR in QFCOS error-level check!");
+//	ASSERT(derr < 128.0 ,"ERROR in QFCOS error-level check!");
 
 	/* sin(100):0xBFE03425B78C4DB8, 0x0708F6155D083EB2(1C6B...), qfsin gives ...83EE5, ~109 bits of accuracy */
 	q = qfsin(i64_to_q((int64)100));
 #if QFDEBUG
-		printf("qsin(100) = %16llX  %16llX\n",q.hi,q.lo);
+		printf("qsin(100) = %16" PRIX64 "  %16" PRIX64 "\n",q.hi,q.lo);
 #endif
 	qref.hi = 0xBFE03425B78C4DB8ull;	qref.lo = 0x0708F6155D083EB2ull;
 	qerr = qfabs(qfsub(q,qref));	derr = qfdbl( qfmul_pow2(qerr,+118) );
@@ -3672,7 +3672,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFSIN error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 128.0 ,"ERROR in QFSIN error-level check!");
+//	ASSERT(derr < 128.0 ,"ERROR in QFSIN error-level check!");
 
 	/*********** Test the hyperbolic-trigs: **********************/
 #if TIMING_TEST
@@ -3681,7 +3681,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfsinh(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3696,7 +3696,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFSINH error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFSINH error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFSINH error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3704,7 +3704,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qfcosh(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3719,7 +3719,7 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFCOSH error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFCOSH error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFCOSH error-level check!");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3727,7 +3727,7 @@ int qtest(void)
 	for(i = 0; i < titers; ++i) {
 		td += qfdbl(qftanh(QLN2));
 	}
-	ASSERT(HERE, td != 0.0, "!");
+	ASSERT(td != 0.0, "!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3742,11 +3742,11 @@ int qtest(void)
 		printf("derr = %10.5f\n", derr);
 		WARN(HERE, "ERROR in QFTANH error-level check!", "", 0);
 	}
-//	ASSERT(HERE, derr < 16.0 ,"ERROR in QFTANH error-level check!");
+//	ASSERT(derr < 16.0 ,"ERROR in QFTANH error-level check!");
 
 	/*********** TEST THE INT --> QFLOAT and ROUND-TOWARD-ZERO AND ROUND-TO-NEAREST FUNCTIONS ************/
-	ASSERT(HERE, CMPEQ128( qfint(qfneg( i64_to_q(  0ull))), NIL128 ), "error!");
-	ASSERT(HERE, CMPEQ128( qfint(qfneg(i128_to_q(NIL128))), NIL128 ), "error!");
+	ASSERT(CMPEQ128( qfint(qfneg( i64_to_q(  0ull))), NIL128 ), "error!");
+	ASSERT(CMPEQ128( qfint(qfneg(i128_to_q(NIL128))), NIL128 ), "error!");
 #if TIMING_TEST
 	clock1 = clock();
 	hidiff = lodiff = 0ull;
@@ -3755,7 +3755,7 @@ int qtest(void)
 		hidiff += i128.d1;
 		lodiff += i128.d0;
 	}
-	ASSERT(HERE, !hidiff && (lodiff == titers), "!");	// NINT(ln2) = 1, titers times
+	ASSERT(!hidiff && (lodiff == titers), "!");	// NINT(ln2) = 1, titers times
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
@@ -3765,9 +3765,9 @@ int qtest(void)
 	q = qfmul_pow2(QONE, -1);
 	i128 = qfnint(q);
 #if QFDEBUG
-		printf("qfnint(0.5) = %16llX  %16llX\n",i128.d1,i128.d0);
+		printf("qfnint(0.5) = %16" PRIX64 "  %16" PRIX64 "\n",i128.d1,i128.d0);
 #endif
-	ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)1),"ERROR 80 in qfloat.c");
+	ASSERT((!i128.d1 && i128.d0 == (uint64)1),"ERROR 80 in qfloat.c");
 
 #if TIMING_TEST
 	clock1 = clock();
@@ -3777,25 +3777,25 @@ int qtest(void)
 		hidiff += i128.d1;
 		lodiff += i128.d0 + qfint(QPI).d0;
 	}
-	ASSERT(HERE, !hidiff && (lodiff == 3*titers), "!");	// INT(ln2) = 0 and INT(pi) = 3, summed (titers) times
+	ASSERT(!hidiff && (lodiff == 3*titers), "!");	// INT(ln2) = 0 and INT(pi) = 3, summed (titers) times
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC;
 	cycles /= (double)titers;
 	printf	("qfint   : cycles/operation = %10.2f\n",cycles - cycles_for_qfdbl);
 #endif
-	i128 = qfnint(QHALF);	ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)1),"ERROR 82 in qfloat.c");
-	i128 = qfint(QHALF);	ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)0),"ERROR 83 in qfloat.c");
-	i128 = qfnint(QEXP);	ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)3),"ERROR 84 in qfloat.c");
-	i128 = qfint(QEXP);		ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)2),"ERROR 85 in qfloat.c");
-	i128 = qfnint(Q2PI);	ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)6),"ERROR 86 in qfloat.c");
-	i128 = qfint(Q2PI);		ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)6),"ERROR 87 in qfloat.c");
+	i128 = qfnint(QHALF);	ASSERT((!i128.d1 && i128.d0 == (uint64)1),"ERROR 82 in qfloat.c");
+	i128 = qfint(QHALF);	ASSERT((!i128.d1 && i128.d0 == (uint64)0),"ERROR 83 in qfloat.c");
+	i128 = qfnint(QEXP);	ASSERT((!i128.d1 && i128.d0 == (uint64)3),"ERROR 84 in qfloat.c");
+	i128 = qfint(QEXP);		ASSERT((!i128.d1 && i128.d0 == (uint64)2),"ERROR 85 in qfloat.c");
+	i128 = qfnint(Q2PI);	ASSERT((!i128.d1 && i128.d0 == (uint64)6),"ERROR 86 in qfloat.c");
+	i128 = qfint(Q2PI);		ASSERT((!i128.d1 && i128.d0 == (uint64)6),"ERROR 87 in qfloat.c");
 	q = qfmul_pow2(Q2PI, 20);
-	i128 = qfnint(q);		ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)6588397),"ERROR 90 in qfloat.c");
+	i128 = qfnint(q);		ASSERT((!i128.d1 && i128.d0 == (uint64)6588397),"ERROR 90 in qfloat.c");
 
 	q = qfmul_pow2(QPI, 125);	/* This gives pi*2^125, which should still fit into a signed 128-bit int. */
 	i128 = qfnint(q);
-	ASSERT(HERE, (i128.d1 = (uint64)0x6487ED5110B4611Aull && i128.d0 == (uint64)0x62633145C06E1000ull),"ERROR 92 in qfloat.c");
+	ASSERT((i128.d1 == (uint64)0x6487ED5110B4611Aull && i128.d0 == (uint64)0x62633145C06E1000ull),"ERROR 92 in qfloat.c");	/* Compare (==) the high word; '=' would overwrite i128.d1 with the 0/1 result of the && */
 
 #if TIMING_TEST
 	exit(0);
diff --git a/src/qfloat.h b/src/qfloat.h
index f6997c3c..9fedd712 100755
--- a/src/qfloat.h
+++ b/src/qfloat.h
@@ -206,8 +206,8 @@ struct qfloat qfcos_or_sin1(struct qfloat q, int cos_or_sin);
 #define QLSHIFT(__x, __n, __y)\
 {\
 	/* Make sure sign/exp fields have been cleared and shift count >= 0: */\
-	ASSERT(HERE, (__x.hi>>52) == 0,"QLSHIFT: sign/exp fields not zero!");\
-	ASSERT(HERE, (int64)__n >= 0,"QLSHIFT: (int64)__n >= 0");\
+	ASSERT((__x.hi>>52) == 0,"QLSHIFT: sign/exp fields not zero!");\
+	ASSERT((int64)__n >= 0,"QLSHIFT: (int64)__n >= 0");\
 	/* Need to handle zero shift count separately: */\
 	if(__n == 0)\
 	{\
@@ -230,15 +230,15 @@ struct qfloat qfcos_or_sin1(struct qfloat q, int cos_or_sin);
 		__y.lo = (uint64)0;\
 	}\
 	/* Make sure exp field at most 1 after shift: */\
-	ASSERT(HERE, (__x.hi>>52) <= 1,"QLSHIFT: exp field out of range on output!");\
+	ASSERT((__x.hi>>52) <= 1,"QLSHIFT: exp field out of range on output!");\
 }
 
 /* (Logical) Right-shift: */
 #define QRSHIFT(__x, __n, __y)\
 {\
 	/* Make sure sign/exp fields have been cleared and shift count >= 0: */\
-	ASSERT(HERE, (__x.hi>>52) == 0,"QRSHIFT:  sign/exp fields not zero!");\
-	ASSERT(HERE, (int64)(__n) >= 0,"QRSHIFT: (int64)(__n) >= 0 !");\
+	ASSERT((__x.hi>>52) == 0,"QRSHIFT:  sign/exp fields not zero!");\
+	ASSERT((int64)(__n) >= 0,"QRSHIFT: (int64)(__n) >= 0 !");\
 	/* Need to handle zero shift count separately: */\
 	if((__n) == 0)\
 	{\
diff --git a/src/radix1008_ditN_cy_dif1.c b/src/radix1008_ditN_cy_dif1.c
index 3961c967..8bb5796b 100755
--- a/src/radix1008_ditN_cy_dif1.c
+++ b/src/radix1008_ditN_cy_dif1.c
@@ -422,11 +422,11 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -465,7 +465,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -475,7 +475,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -518,24 +518,24 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix1008_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix1008_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -568,13 +568,13 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, half_arr_offset1008 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT(half_arr_offset1008 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
 			j = (1<<(2*(L2_SZ_VD-2))) + 4;	// 16+4 for sse2, 64+4 for avx
 		} else {
 			j = ODD_RADIX<<2;				// 4*ODD_RADIX
 		}
-		ASSERT(HERE, (radix1008_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix1008_creals_in_local_store checksum failed!");
+		ASSERT((radix1008_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix1008_creals_in_local_store checksum failed!");
 
 		// Roots for radix-16 DFTs:
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one  , 1.0  );
@@ -651,7 +651,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1025,12 +1025,12 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1065,7 +1065,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1116,7 +1116,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 					break;
 				};
 			}	//	printf("wts_idx_incr = %u\n",wts_idx_incr);
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1550,8 +1550,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1561,8 +1561,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1571,26 +1571,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 		#ifdef USE_AVX
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1600,8 +1600,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1822,7 +1822,7 @@ for(outer=0; outer <= 1; outer++)
 	/*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/
 	for(j = 0; j < main_work_units; ++j)
 	{
-		ASSERT(HERE, 0x0 == cy1008_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy1008_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1832,7 +1832,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2632,8 +2632,8 @@ void radix1008_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2697,23 +2697,23 @@ void radix1008_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 	{
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix1008_main_carry_loop.h b/src/radix1008_main_carry_loop.h
index 11303f91..d3927fa3 100755
--- a/src/radix1008_main_carry_loop.h
+++ b/src/radix1008_main_carry_loop.h
@@ -229,7 +229,7 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 						// (and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1).
 			// *But*: since the init macro does an on-the-fly version of this between j,j+2 portions, external code co2=co3 must come *after* both ctmp-data octets are inited.
 		  #ifdef USE_AVX512
-			ASSERT(HERE, 0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!");
+			ASSERT(0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!");
 		  #endif
 			AVX_cmplx_carry_fast_wtsinit_X8(add1,add2,add3, itmp, half_arr,sign_mask, n_minus_sil,n_minus_silp1,sinwt,sinwtm1, sse_bw,sse_n)
 
diff --git a/src/radix1024_ditN_cy_dif1.c b/src/radix1024_ditN_cy_dif1.c
index 70206319..5ecf1504 100755
--- a/src/radix1024_ditN_cy_dif1.c
+++ b/src/radix1024_ditN_cy_dif1.c
@@ -221,7 +221,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
 	struct complex t[RADIX], *tptr;
-	double *addr,*addi;
+	const double *addr,*addi;
 	int *itmp,*itm2;	// Pointer into the bjmodn array
 	int err;
 	static int first_entry=TRUE;
@@ -375,11 +375,11 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -418,7 +418,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -428,7 +428,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -471,22 +471,22 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		cslots_in_local_store = radix1024_creals_in_local_store + (20+RADIX/2)/2;	// Just add enough int64 space for both cases, plus some
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix1024_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -520,8 +520,8 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, half_arr_offset1024 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix1024_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix1024_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset1024 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix1024_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix1024_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(isrt2,ISRT2);
@@ -672,7 +672,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 				qc = qfcos(qt);	qs = qfsin(qt);
 				qx = QONE;		qy = QZRO;
 				for(j = 0; j < RADIX; j++) {
-					printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+					printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 					// Up-multiply the complex exponential:
 					qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 					qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1194,14 +1194,14 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/radix-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < NDIVR/CY_THREADS; j++)
@@ -1221,7 +1221,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 			if(CY_THREADS > 1)
 			{
 				for(ithread = 1; ithread < CY_THREADS; ithread++)
@@ -1400,8 +1400,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1411,8 +1411,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1421,20 +1421,20 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
-		ASSERT(HERE, ((tmp + 0x1080)->d0 == ISRT2 && (tmp + 0x1080)->d1 == ISRT2), "thread-local memcheck failed!");
+		ASSERT(((tmp + 0x1080)->d0 == ISRT2 && (tmp + 0x1080)->d1 == ISRT2), "thread-local memcheck failed!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1444,11 +1444,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1461,8 +1461,8 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_SSE2)
 			// This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots:
-			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			// scale gets set immediately prior to calling carry macro, hence no use checking it here.
 			/* init carries	*/
@@ -1685,7 +1685,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy1024_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy1024_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1695,7 +1695,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("%s end  ; #tasks = %d, #free_tasks = %d\n",func, tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1869,7 +1869,7 @@ void radix1024_dif_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n >> 10;
 		p1 = NDIVR;
@@ -2605,7 +2605,7 @@ void radix1024_dit_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n >> 10;
 		p1 = NDIVR;
@@ -3214,8 +3214,8 @@ void radix1024_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -3442,11 +3442,11 @@ void radix1024_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -3456,18 +3456,18 @@ void radix1024_dit_pass1(double a[], int n)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#else	// SSE2:
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		} else {
 		#ifdef USE_AVX512
 			/* No-Op */
 		#else
-			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		}
 
diff --git a/src/radix1024_main_carry_loop.h b/src/radix1024_main_carry_loop.h
index 761bd6e6..8f07752b 100755
--- a/src/radix1024_main_carry_loop.h
+++ b/src/radix1024_main_carry_loop.h
@@ -168,8 +168,8 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee
 		// In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass:
 		if(target_idx == j) {
 		#ifdef USE_SSE2
-			addr = (double *)s1p00 + target_set;
-			*addr += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
+			double *addr_ = (double *)s1p00 + target_set;
+			*addr_ += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
 		#else
 			// target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum:
 			l = target_set&1;	target_set >>= 1;
@@ -470,26 +470,28 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee
 	  if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) {	// LOACC with tunable DWT-weights chaining
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy_r; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy_r;
 		for(ntmp = 0; ntmp < RADIX>>2; ntmp++) {
 			jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp];	// poff[] = p04,08,...
 			// Re-init weights every 4th macro invocation to keep errors under control:
-			cmplx_carry_norm_pow2_errcheck0(a[jt   ],a[jp   ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_pow2_errcheck0(a[jt   ],a[jp   ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  } else {	// HiACC:
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy_r; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy_r;
 		for(ntmp = 0; ntmp < RADIX>>2; ntmp++) {
 			jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp];	// poff[] = p04,08,...
-			cmplx_carry_norm_pow2_errcheck0(a[jt   ],a[jp   ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_pow2_errcheck0(a[jt   ],a[jp   ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  }	// LOACC or HIACC?
@@ -683,13 +685,14 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee
 	#else	// Scalar-double mode:
 
 		// Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2):
-		ntmp = 0; addr = cy_r; addi = cy_i;
+		ntmp = 0;
+		double *addr_ = cy_r, *addi_ = cy_i;
 		for(m = 0; m < RADIX>>2; m++) {
 			jt = j1 + poff[m]; jp = j2 + poff[m];
-			fermat_carry_norm_pow2_errcheck(a[jt   ],a[jp   ],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p1],a[jp+p1],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p2],a[jp+p2],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p3],a[jp+p3],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
+			fermat_carry_norm_pow2_errcheck(a[jt   ],a[jp   ],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p1],a[jp+p1],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p2],a[jp+p2],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p3],a[jp+p3],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
 		}
 
 	#endif	/* #ifdef USE_SSE2 */
diff --git a/src/radix128_ditN_cy_dif1.c b/src/radix128_ditN_cy_dif1.c
index 76e45306..5faef1a7 100755
--- a/src/radix128_ditN_cy_dif1.c
+++ b/src/radix128_ditN_cy_dif1.c
@@ -223,7 +223,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 	// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
-	double *addr, *addi;
+	const double *addr, *addi;
 	struct complex t[RADIX], *tptr;
 	int *itmp,*itm2;	// Pointer into the bjmodn array
 	int err;
@@ -383,11 +383,11 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -426,7 +426,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -436,7 +436,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -479,23 +479,23 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread
 		cslots_in_local_store = radix128_creals_in_local_store + (20+RADIX/2)/2;	// Just add enough int64 space for both cases, plus some
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix128_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -604,8 +604,8 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
-//		ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix128_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix128_creals_in_local_store checksum failed!");
+//		ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix128_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix128_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one  , 1.0  );
@@ -753,7 +753,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				qc = qfcos(qt);	qs = qfsin(qt);
 				qx = QONE;		qy = QZRO;
 				for(j = 0; j < RADIX; j++) {
-					printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+					printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 					// Up-multiply the complex exponential:
 					qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 					qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1168,14 +1168,14 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/radix-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < NDIVR/CY_THREADS; j++)
@@ -1195,7 +1195,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 			if(CY_THREADS > 1)
 			{
 				for(ithread = 1; ithread < CY_THREADS; ithread++)
@@ -1374,8 +1374,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1385,8 +1385,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1395,19 +1395,19 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1417,11 +1417,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1434,8 +1434,8 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_SSE2)
 			// This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots:
-			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			// scale gets set immediately prior to calling carry macro, hence no use checking it here.
 			/* init carries	*/
@@ -1658,7 +1658,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy128_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy128_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1668,7 +1668,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("%s end  ; #tasks = %d, #free_tasks = %d\n",func, tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -2676,8 +2676,8 @@ void radix128_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2823,13 +2823,13 @@ void radix128_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
 		// Must make this check 'fuzzy' to allow for wrong-way-round experiments:
-		ASSERT(HERE, (fabs(isrt2->d0 - ISRT2) < EPS && fabs(isrt2->d1 - ISRT2) < EPS), "thread-local memcheck failed!");
+		ASSERT((fabs(isrt2->d0 - ISRT2) < EPS && fabs(isrt2->d1 - ISRT2) < EPS), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2839,18 +2839,18 @@ void radix128_dit_pass1(double a[], int n)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#else	// SSE2:
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		} else {
 		#ifdef USE_AVX512
 			/* No-Op */
 		#else
-			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		}
 
diff --git a/src/radix128_main_carry_loop.h b/src/radix128_main_carry_loop.h
index 8a6b1df7..8a813bb6 100755
--- a/src/radix128_main_carry_loop.h
+++ b/src/radix128_main_carry_loop.h
@@ -229,8 +229,8 @@ as are the index offsets of each sets of complex outputs in the A-array: [jt,jp]
 		// In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass:
 		if(target_idx == j) {
 		#ifdef USE_SSE2
-			addr = (double *)s1p00 + target_set;
-			*addr += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
+			double *addr_ = (double *)s1p00 + target_set;
+			*addr_ += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
 		#else
 			// target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum:
 			l = target_set&1;	target_set >>= 1;
@@ -531,26 +531,28 @@ as are the index offsets of each sets of complex outputs in the A-array: [jt,jp]
 	  if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) {	// LOACC with tunable DWT-weights chaining
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy_r; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy_r;
 		for(ntmp = 0; ntmp < RADIX>>2; ntmp++) {
 			jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp];	// poff[] = p04,08,...
 			// Re-init weights every 4th macro invocation to keep errors under control:
-			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  } else {	// HiACC:
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy_r; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy_r;
 		for(ntmp = 0; ntmp < RADIX>>2; ntmp++) {
 			jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp];	// poff[] = p04,08,...
-			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  }	// LOACC or HIACC?
@@ -744,13 +746,14 @@ as are the index offsets of each sets of complex outputs in the A-array: [jt,jp]
 	#else	// Scalar-double mode:
 
 		// Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2);
-		ntmp = 0; addr = cy_r; addi = cy_i;
+		ntmp = 0;
+		double *addr_ = cy_r, *addi_ = cy_i;
 		for(m = 0; m < RADIX>>2; m++) {
 			jt = j1 + poff[m]; jp = j2 + poff[m];	// poff[] = p04,08,...
-			fermat_carry_norm_pow2_errcheck(a[jt    ],a[jp    ],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
+			fermat_carry_norm_pow2_errcheck(a[jt    ],a[jp    ],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
 		}
 
 	#endif	/* #ifdef USE_SSE2 */
diff --git a/src/radix12_ditN_cy_dif1.c b/src/radix12_ditN_cy_dif1.c
index 5e6866b6..bca6532c 100755
--- a/src/radix12_ditN_cy_dif1.c
+++ b/src/radix12_ditN_cy_dif1.c
@@ -253,7 +253,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "radix12_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "radix12_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -312,11 +312,11 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -355,7 +355,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -365,7 +365,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -401,18 +401,18 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix12_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix12_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 48 16-byte slots of sc_arr for temporaries, next 2 for the nontrivial complex roots,
 	next 6 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -819,12 +819,12 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		_cy10	= (double *)malloc(j);	ptr_prod += (uint32)(_cy10== 0x0);
 		_cy11	= (double *)malloc(j);	ptr_prod += (uint32)(_cy11== 0x0);
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix12_ditN_cy_dif1.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix12_ditN_cy_dif1.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/20-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix12_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix12_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -848,7 +848,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		first_entry=FALSE;
 	}	/* endif(first_entry) */
@@ -1002,8 +1002,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1013,8 +1013,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1023,20 +1023,20 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1196,7 +1196,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy12_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy12_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1206,7 +1206,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -1259,7 +1259,7 @@ for(outer=0; outer <= 1; outer++)
 
 	for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 	{
-		ASSERT(HERE, CY_THREADS > 1,"radix20_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
+		ASSERT(CY_THREADS > 1,"radix20_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
 		_cy00[ithread] = _cy00[ithread-1];
 		_cy01[ithread] = _cy01[ithread-1];
 		_cy02[ithread] = _cy02[ithread-1];
@@ -1888,8 +1888,8 @@ void radix12_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -1942,17 +1942,17 @@ void radix12_dit_pass1(double a[], int n)
 		r11	= r00 + 0x16;		s1p11 = tmp + 0x16;		half_arr= tmp + 0x23;	/* This table needs 20x16 bytes */
 																// half_arr = r00 + 0x3b; This is where the value of half_arr_offset12 comes from
 	#endif
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 		tmp = half_arr;
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix144_ditN_cy_dif1.c b/src/radix144_ditN_cy_dif1.c
index 885298e2..eb498609 100755
--- a/src/radix144_ditN_cy_dif1.c
+++ b/src/radix144_ditN_cy_dif1.c
@@ -334,7 +334,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -402,11 +402,11 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -444,7 +444,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -454,7 +454,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -494,24 +494,24 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix144_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix144_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix144_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -552,7 +552,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 2 = 0x296; This is where the value of half_arr_offset144 comes from
 		half_arr= tmp + 0x02;	// This table needs 32 x 16 bytes in SSE2 mode
 	  #endif
-		ASSERT(HERE, (radix144_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!");
+		ASSERT((radix144_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );		VEC_DBL_INIT(one, 1.0  );
 	  #if 0	// Here this trick actually degrades accuracy ... must be interaction with the radix-9 DFTs of some kind
@@ -951,12 +951,12 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -980,7 +980,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1132,8 +1132,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1143,8 +1143,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1153,26 +1153,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1305,7 +1305,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy144_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy144_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1315,7 +1315,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2018,8 +2018,8 @@ void radix144_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2134,20 +2134,20 @@ void radix144_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	// This table needs 20 x 16 bytes in SSE2 mode
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix160_ditN_cy_dif1.c b/src/radix160_ditN_cy_dif1.c
index 138a659a..791a96e6 100755
--- a/src/radix160_ditN_cy_dif1.c
+++ b/src/radix160_ditN_cy_dif1.c
@@ -304,7 +304,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -372,11 +372,11 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -414,7 +414,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -424,7 +424,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -464,24 +464,24 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix160_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix160_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix160_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -521,7 +521,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x(290 + 50 + 2) = 0x2e2; This is where the value of half_arr_offset160 comes from
 		half_arr= tmp + 0x02;
 	  #endif
-		ASSERT(HERE, (radix160_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix160_creals_in_local_store checksum failed!");
+		ASSERT((radix160_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix160_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
 	  #if 1
@@ -1308,12 +1308,12 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1337,7 +1337,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1489,8 +1489,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1500,8 +1500,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1510,26 +1510,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1659,7 +1659,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy160_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy160_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1669,7 +1669,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2529,8 +2529,8 @@ void radix160_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -3024,21 +3024,21 @@ void radix160_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix16_dif_dit_pass.c b/src/radix16_dif_dit_pass.c
index 4b96f6ee..8e1f6095 100755
--- a/src/radix16_dif_dit_pass.c
+++ b/src/radix16_dif_dit_pass.c
@@ -210,17 +210,17 @@ void radix16_dif_pass	(double a[],             int n, struct complex rt0[], stru
 	//	fprintf(stderr, "radix16_dif_dit_pass pfetch_dist = %d\n", pfetch_dist);
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage
 			free((void *)sc_arr);	sc_arr=0x0;
 		}
 		// v19 alloc'ed 72* ... v20 needs [1+1+4+8] = 18 more slots in SSE2 mode, [1+1+2+4] = 8 more in AVX/AVX2 mode, [1+1+1+2] = 5 more in AVX-512 mode,
 		// just use 20 more slots in all cases for simplicity's sake. Further add 12 slots for doubled-into-vectors 6-term Chebyshev expansions of cos, sin:
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 104*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 104*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots,
 	last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array.
@@ -332,7 +332,7 @@ void radix16_dif_pass	(double a[],             int n, struct complex rt0[], stru
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		dtmp = (double)12.56637061435917295376/n;	// twopin = 2*pi/[complex FFT length] = 2*pi/(n/2) = 4*pi/n
 		r1 = __r0 + thr_id*104;
 		isrt2 = r1 + 0x20;
@@ -403,7 +403,7 @@ void radix16_dif_pass	(double a[],             int n, struct complex rt0[], stru
 	encounter the same sets or index strides (albeit in opposite order), can split such tests between them:
 	*** 2014: Failure of this assertion led me to find dependence on it in my new AVX2/FMA-based DIT macro ***
 	*** [But fix obviates said dependence, so no longer appropriate to enforce it.] ***
-		ASSERT(HERE, p2  == p1+p1, "radix16_dif_pass: p2  != p1+p1!");
+		ASSERT(p2  == p1+p1, "radix16_dif_pass: p2  != p1+p1!");
 	*/
 	iroot_prim=(incr >> 5);		/* (incr/2)/radix_now */
 	for(m=0; m < nloops; m++)	/* NLOOPS may range from 1 (if first pass radix = 16) to P*N/32 (last pass radix = 16).	 */
@@ -543,8 +543,8 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou
 		// Loop to test various fast alternatives to j/(n>>4) for every j < n/2:
 		for(j = 0; j < (n>>1); j++) {
 			// This fails for e.g. j = 393205 and (n>>4) = 393216:
-			//	ASSERT(HERE, __MULH32(j,imult) == j/(n>>4), "umulh32(j,imult) != j/(n>>4)");
-			ASSERT(HERE, (int)((float)j*fndiv16) == j/(n>>4), "(float)j*fndiv16 != j/(n>>4)");
+			//	ASSERT(__MULH32(j,imult) == j/(n>>4), "umulh32(j,imult) != j/(n>>4)");
+			ASSERT((int)((float)j*fndiv16) == j/(n>>4), "(float)j*fndiv16 != j/(n>>4)");
 		}
 	i = 163397;
 		const double pi4_dbl = (double)0.78539816339744830961, twopin_dbl = 16*pi4_dbl/n;
@@ -556,7 +556,7 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou
 			scos[0] = cos(gamma[j]); scos[1] = sin(gamma[j]);
 			is0[j] = ((unsigned int)io[j] - 2) < 4; is1[j] = (io[j] > 3);
 			jj[j] = IS_ODD((io[j]+1)>>1);
-			ASSERT(HERE, (int)ff[j] == io[j], "ff != io error!");
+			ASSERT((int)ff[j] == io[j], "ff != io error!");
 			twiddle[j].re = sign[is0[j]]*scos[jj[j]]; twiddle[j].im = sign[is1[j]]*scos[jj[j]^1];
 		}
 	#endif
@@ -988,7 +988,7 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou
 		*add2++ = rt;	// cF, will get multiplied by 1/c7 to yield __cF7
 
 		// This places us at add0 == c8 and add1 = c12.
-		ASSERT(HERE, add0 == (double *)cc0+16 && add1 == (double *)cc0+32 && add2 == (double *)cc0+44, "add0,1,2 checksum failed in AVX2 sincos inits!");
+		ASSERT(add0 == (double *)cc0+16 && add1 == (double *)cc0+32 && add2 == (double *)cc0+44, "add0,1,2 checksum failed in AVX2 sincos inits!");
 		/*
 		At this point, the 11 ymm-sized [32-byte] chunks starting at &cc0 contain the following scalar-double data:
 
@@ -1621,7 +1621,7 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou
 		addr += p1;
 		prefetch_p_doubles(addr);
 	  #endif
-		/* Debug: check for overflow of + terms: */	ASSERT(HERE, m1+m17 >= m1 && m2+m18 >= m2,"Overflow of [0,8b] term!");
+		/* Debug: check for overflow of + terms: */	ASSERT(m1+m17 >= m1 && m2+m18 >= m2,"Overflow of [0,8b] term!");
 		a[jt    ]= t1+t17;	a[jp    ]= t2+t18;		b[jt    ]=qreduce( m1+m17   );	b[jp    ]=qreduce( m2+m18   );	// + terms in   0,8b
 		a[jt+p1 ]= t1-t17;	a[jp+p1 ]= t2-t18;		b[jt+p1 ]=qreduce( m1-m17+q4);	b[jp+p1 ]=qreduce( m2-m18+q4);	// - terms in -4b,4b
 		// mpy by E^4=i is inlined here:
@@ -2010,7 +2010,7 @@ void radix16_dit_pass	(double a[],             int n, struct complex rt0[], stru
 #endif
 
 #ifndef COMPILER_TYPE_GCC
-	ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+	ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 #endif
 
 #ifdef USE_SSE2
@@ -2023,15 +2023,15 @@ void radix16_dit_pass	(double a[],             int n, struct complex rt0[], stru
 	{
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage
 			free((void *)sc_arr);	sc_arr=0x0;
 		}
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots,
 	last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array.
@@ -2083,7 +2083,7 @@ void radix16_dit_pass	(double a[],             int n, struct complex rt0[], stru
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		r1 = __r0 + thr_id*72;
 		isrt2 = r1 + 0x20;
 		cc0   = r1 + 0x21;
@@ -2108,9 +2108,9 @@ void radix16_dit_pass	(double a[],             int n, struct complex rt0[], stru
 	// body (both C and ASM). Since such checks may be runlength-dependent, need to be cheap enough to leave on
 	// all the time, as here where we do them just once prior to entering the processing loop. Since DIF and DIT
 	// encounter the same sets or index strides (albeit in opposite order), can split such tests between them:
-	ASSERT(HERE, p4  == p2+p2, "radix16_dit_pass: p4  != p2+p2!");
-	ASSERT(HERE, p8  == p4+p4, "radix16_dit_pass: p8  != p4+p4!");
-	ASSERT(HERE, p12 == p4+p8, "radix16_dit_pass: p12 != p4+p8!");
+	ASSERT(p4  == p2+p2, "radix16_dit_pass: p4  != p2+p2!");
+	ASSERT(p8  == p4+p4, "radix16_dit_pass: p8  != p4+p4!");
+	ASSERT(p12 == p4+p8, "radix16_dit_pass: p12 != p4+p8!");
 
 	iroot_prim=(incr >> 5);		/* (incr/2)/radix_now */
 
@@ -2398,7 +2398,7 @@ void radix16_dit_pass	(double a[],             int n, struct complex rt0[], stru
 		*add1++ = it;	// s15 slot will hold __rF = s15/c15
 
 		// This places us at add0 == c8 and add1 = c12.
-		ASSERT(HERE, add0 == (double *)cc0+16 && add1 == (double *)cc0+32, "add0,1 checksum failed in AVX2 DIT sincos inits!");
+		ASSERT(add0 == (double *)cc0+16 && add1 == (double *)cc0+32, "add0,1 checksum failed in AVX2 DIT sincos inits!");
 		/*
 		At this point, the 8 ymm-sized [32-byte] chunks starting at &cc0 contain the following scalar-double data:
 
@@ -2420,9 +2420,9 @@ void radix16_dit_pass	(double a[],             int n, struct complex rt0[], stru
 		add0[0x00] = c;
 		add0[0x10] = tan;
 		add0[0x20] = 1.0;
-	//	ASSERT(HERE, *(add0-1) == ISRT2, "Scalar ISRT2 bad!");
+	//	ASSERT(*(add0-1) == ISRT2, "Scalar ISRT2 bad!");
 		c_tmp = cc0 + 0x22;	// 1.0 x 4
-	//	ASSERT(HERE, c_tmp->d0 == 1.0 && c_tmp->d0 == c_tmp->d1 && c_tmp->d0 == c_tmp->d2 && c_tmp->d0 == c_tmp->d3, "1.0 x 4 mismatch!");
+	//	ASSERT(c_tmp->d0 == 1.0 && c_tmp->d0 == c_tmp->d1 && c_tmp->d0 == c_tmp->d2 && c_tmp->d0 == c_tmp->d3, "1.0 x 4 mismatch!");
 
 		/* Scalar data starting at add0 = cc0 now laid out as below:
 
diff --git a/src/radix16_dif_dit_pass_asm.h b/src/radix16_dif_dit_pass_asm.h
index 4ea15c1b..75d3ab38 100755
--- a/src/radix16_dif_dit_pass_asm.h
+++ b/src/radix16_dif_dit_pass_asm.h
@@ -49,7 +49,7 @@ The workaround is to use -O1 or higher, whether one is building a debuggable bin
 	{\
 		double *add0,*add1,*add2;\
 		add0 = (double *)__twid_ptr;	/* add0 points to 16 cos-data-to-be-inverted; Need a double-ptr on lhs here */\
-		ASSERT(HERE, add0 != 0x0, "Null add0 pointer!");\
+		ASSERT(add0 != 0x0, "Null add0 pointer!");\
 		add1 = add0 + 16;	/* add1 points to block of memory temporarily used to store the corresponding sine data */\
 		add2 = add0 + 32;	/* add2 points to block of memory temporarily used to store the 11 [0-padded to 12]
 							cosine data which need to be divided by other cosines (i.e. multiplied by inverses) */\
@@ -134,7 +134,7 @@ The workaround is to use -O1 or higher, whether one is building a debuggable bin
 		*add2++ = __cF;	/* cF, will get multiplied by 1/c7 to yield __cF7 */\
 \
 		/* This places us at add0 == c8 and add1 = c12. */\
-		ASSERT(HERE, add0 == (double *)__twid_ptr+16 && add1 == (double *)__twid_ptr+32 && add2 == (double *)__twid_ptr+44, "add0,1,2 checksum failed in AVX2 sincos inits!");\
+		ASSERT(add0 == (double *)__twid_ptr+16 && add1 == (double *)__twid_ptr+32 && add2 == (double *)__twid_ptr+44, "add0,1,2 checksum failed in AVX2 sincos inits!");\
 	/*
 	At this point, the 11 ymm-sized [32-byte] chunks starting at &__twid_ptr contain the following scalar-double data:
 
diff --git a/src/radix16_ditN_cy_dif1.c b/src/radix16_ditN_cy_dif1.c
index 0aa9da4a..804fc52f 100755
--- a/src/radix16_ditN_cy_dif1.c
+++ b/src/radix16_ditN_cy_dif1.c
@@ -409,7 +409,7 @@ int radix16_ditN_cy_dif1		(double a[],             int n, int nwt, int nwt_bits,
 				if(k > 60) k -= 61;
 			}
 		}
-		ASSERT(HERE, isPow2(N2), "N/2 not a power of 2!");
+		ASSERT(isPow2(N2), "N/2 not a power of 2!");
 		l2_n2 = trailz32(N2);
 // ******* For carry step, also need the 16 values of bimodnmod61 for i = j*(n/radix0), j = 0,...,15 ************
 	#endif
@@ -439,11 +439,11 @@ int radix16_ditN_cy_dif1		(double a[],             int n, int nwt, int nwt_bits,
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -482,7 +482,7 @@ int radix16_ditN_cy_dif1		(double a[],             int n, int nwt, int nwt_bits,
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -493,7 +493,7 @@ int radix16_ditN_cy_dif1		(double a[],             int n, int nwt, int nwt_bits,
 
 				main_work_units = 0;
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -537,18 +537,18 @@ int radix16_ditN_cy_dif1		(double a[],             int n, int nwt, int nwt_bits,
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix16_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix16_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots,
 	next 16 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -685,7 +685,7 @@ int radix16_ditN_cy_dif1		(double a[],             int n, int nwt, int nwt_bits,
 			// Up-multiply the complex exponential:
 			qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 			qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
-			printf("j = %3u: cos[j*Pi/2] = 0x%16llX, sin[j*Pi/2] = 0x%16llX\n",j,qfdbl_as_uint64(qx),qfdbl_as_uint64(qy));
+			printf("j = %3u: cos[j*Pi/2] = 0x%16" PRIX64 ", sin[j*Pi/2] = 0x%16" PRIX64 "\n",j,qfdbl_as_uint64(qx),qfdbl_as_uint64(qy));	// Literal "0x" kept outside the %16 field: the '#' flag would emit "0X", count the prefix inside the 16-column width, and omit it entirely for a zero value
 		}
 		exit(0);
 	#endif
@@ -1207,14 +1207,14 @@ half_arr+5*radix	radix		[LOACC-only] inv_mult-lut
 		_cy_iE	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_iE== 0x0);
 		_cy_iF	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_iF== 0x0);
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/16-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < NDIVR/CY_THREADS; j++)
@@ -1235,7 +1235,7 @@ half_arr+5*radix	radix		[LOACC-only] inv_mult-lut
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 		}
 
 		first_entry=FALSE;
@@ -1415,8 +1415,8 @@ for(outer=0; outer <= 1; outer++)
 	for(ithread = 0; ithread < CY_THREADS; ithread++)
 	{
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1426,8 +1426,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1439,22 +1439,22 @@ for(outer=0; outer <= 1; outer++)
 		// on successive calls, so set here at runtime rather than in init-only block:
 		tdat[ithread].arrdat = a;			/* Main data array */
 	#ifdef USE_FGT61
-		ASSERT(HERE, tdat[ithread].brrdat == b, "thread-local memcheck fail!");			/* Modular version of main data array */
+		ASSERT(tdat[ithread].brrdat == b, "thread-local memcheck fail!");			/* Modular version of main data array */
 	#endif
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
-		ASSERT(HERE, ((tmp + 0x20)->d0 == ISRT2 && (tmp + 0x20)->d1 == ISRT2), "thread-local memcheck failed!");
+		ASSERT(((tmp + 0x20)->d0 == ISRT2 && (tmp + 0x20)->d1 == ISRT2), "thread-local memcheck failed!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1464,11 +1464,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			tdat[ithread].bjmodn0 = _bjmodn0[ithread];
 			tdat[ithread].bjmodn1 = _bjmodn1[ithread];
@@ -1510,8 +1510,8 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_SSE2)
 			// This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots:
-			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 			// scale gets set immediately prior to calling carry macro, hence no use checking it here.
 		#endif
 			/* init carries	*/
@@ -1858,7 +1858,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy16_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy16_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1868,7 +1868,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix16_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1957,7 +1957,7 @@ for(outer=0; outer <= 1; outer++)
 
 		for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 		{
-			ASSERT(HERE, CY_THREADS > 1,"");	/* Make sure loop only gets executed if multiple threads */
+			ASSERT(CY_THREADS > 1,"");	/* Make sure loop only gets executed if multiple threads */
 			_cy_r0[ithread] = _cy_r0[ithread-1];
 			_cy_r1[ithread] = _cy_r1[ithread-1];
 			_cy_r2[ithread] = _cy_r2[ithread-1];
@@ -2018,7 +2018,7 @@ for(outer=0; outer <= 1; outer++)
 			// Must use NDIVR instead of p1 here since p1 may have pads which are not applied to element-2-slots-before
 			j1 = NDIVR-2;	j1 += ( (j1 >> DAT_BITS) << PAD_BITS );
 			j2 = j1+RE_IM_STRIDE;
-			ASSERT(HERE, t31 <= 1.0 && t32 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!");
+			ASSERT(t31 <= 1.0 && t32 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!");
 			// Undo the initial dif pass just for the 16 complex terms in question:
 			RADIX_16_DIT(a[j1],a[j2],a[j1+p1 ],a[j2+p1 ],a[j1+p2 ],a[j2+p2 ],a[j1+p3 ],a[j2+p3 ],a[j1+p4 ],a[j2+p4 ],a[j1+p5 ],a[j2+p5 ],a[j1+p6 ],a[j2+p6 ],a[j1+p7 ],a[j2+p7 ],a[j1+p8 ],a[j2+p8 ],a[j1+p9 ],a[j2+p9 ],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15]
 						,a[j1],a[j2],a[j1+p1 ],a[j2+p1 ],a[j1+p2 ],a[j2+p2 ],a[j1+p3 ],a[j2+p3 ],a[j1+p4 ],a[j2+p4 ],a[j1+p5 ],a[j2+p5 ],a[j1+p6 ],a[j2+p6 ],a[j1+p7 ],a[j2+p7 ],a[j1+p8 ],a[j2+p8 ],a[j1+p9 ],a[j2+p9 ],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15]
@@ -2043,11 +2043,11 @@ for(outer=0; outer <= 1; outer++)
 			// Verify that any cyout = 1 has the corresponding high word < 0,
 			// then absorb cyout back into the high word and zero the carry:
 			if(t31 == 1.0) {
-				ASSERT(HERE, a[j1+p15] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!");
+				ASSERT(a[j1+p15] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!");
 				a[j1+p15] += FFT_MUL_BASE;	t31 = 0.0;
 			}
 			if(t32 == 1.0) {
-				ASSERT(HERE, a[j2+p15] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!");
+				ASSERT(a[j2+p15] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!");
 				a[j2+p15] += FFT_MUL_BASE;	t32 = 0.0;
 			}
 			// Redo the initial dif pass just for the 16 complex terms in question:
@@ -2058,7 +2058,7 @@ for(outer=0; outer <= 1; outer++)
 
 		for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 		{
-			ASSERT(HERE, CY_THREADS > 1,"");	/* Make sure loop only gets executed if multiple threads */
+			ASSERT(CY_THREADS > 1,"");	/* Make sure loop only gets executed if multiple threads */
 			_cy_r0[ithread] = _cy_r0[ithread-1];	_cy_i0[ithread] = _cy_i0[ithread-1];
 			_cy_r1[ithread] = _cy_r1[ithread-1];	_cy_i1[ithread] = _cy_i1[ithread-1];
 			_cy_r2[ithread] = _cy_r2[ithread-1];	_cy_i2[ithread] = _cy_i2[ithread-1];
@@ -2115,22 +2115,22 @@ for(outer=0; outer <= 1; outer++)
 	#ifdef USE_FGT61
 		if(!j) {
 			printf("J = 0, wraparound INputs:\n");
-			printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2    ],a[j2    +1], b[j2    ],b[j2    +1]);
-			printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p1 ],a[j2+p1 +1], b[j2+p1 ],b[j2+p1 +1]);
-			printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p2 ],a[j2+p2 +1], b[j2+p2 ],b[j2+p2 +1]);
-			printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p3 ],a[j2+p3 +1], b[j2+p3 ],b[j2+p3 +1]);
-			printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p4 ],a[j2+p4 +1], b[j2+p4 ],b[j2+p4 +1]);
-			printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p5 ],a[j2+p5 +1], b[j2+p5 ],b[j2+p5 +1]);
-			printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p6 ],a[j2+p6 +1], b[j2+p6 ],b[j2+p6 +1]);
-			printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p7 ],a[j2+p7 +1], b[j2+p7 ],b[j2+p7 +1]);
-			printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p8 ],a[j2+p8 +1], b[j2+p8 ],b[j2+p8 +1]);
-			printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p9 ],a[j2+p9 +1], b[j2+p9 ],b[j2+p9 +1]);
-			printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p10],a[j2+p10+1], b[j2+p10],b[j2+p10+1]);
-			printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p11],a[j2+p11+1], b[j2+p11],b[j2+p11+1]);
-			printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p12],a[j2+p12+1], b[j2+p12],b[j2+p12+1]);
-			printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p13],a[j2+p13+1], b[j2+p13],b[j2+p13+1]);
-			printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p14],a[j2+p14+1], b[j2+p14],b[j2+p14+1]);
-			printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p15],a[j2+p15+1], b[j2+p15],b[j2+p15+1]);
+			printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2    ],a[j2    +1], b[j2    ],b[j2    +1]);
+			printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p1 ],a[j2+p1 +1], b[j2+p1 ],b[j2+p1 +1]);
+			printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p2 ],a[j2+p2 +1], b[j2+p2 ],b[j2+p2 +1]);
+			printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p3 ],a[j2+p3 +1], b[j2+p3 ],b[j2+p3 +1]);
+			printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p4 ],a[j2+p4 +1], b[j2+p4 ],b[j2+p4 +1]);
+			printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p5 ],a[j2+p5 +1], b[j2+p5 ],b[j2+p5 +1]);
+			printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p6 ],a[j2+p6 +1], b[j2+p6 ],b[j2+p6 +1]);
+			printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p7 ],a[j2+p7 +1], b[j2+p7 ],b[j2+p7 +1]);
+			printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p8 ],a[j2+p8 +1], b[j2+p8 ],b[j2+p8 +1]);
+			printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p9 ],a[j2+p9 +1], b[j2+p9 ],b[j2+p9 +1]);
+			printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p10],a[j2+p10+1], b[j2+p10],b[j2+p10+1]);
+			printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p11],a[j2+p11+1], b[j2+p11],b[j2+p11+1]);
+			printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p12],a[j2+p12+1], b[j2+p12],b[j2+p12+1]);
+			printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p13],a[j2+p13+1], b[j2+p13],b[j2+p13+1]);
+			printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p14],a[j2+p14+1], b[j2+p14],b[j2+p14+1]);
+			printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p15],a[j2+p15+1], b[j2+p15],b[j2+p15+1]);
 		}
 	#endif
 			a[j2    ] *= radix_inv;
@@ -2169,22 +2169,22 @@ for(outer=0; outer <= 1; outer++)
 			b[j2+p15] = mul_pow2_modq( b[j2+p15], 57);
 		if(j==1) {
 			printf("J = 0, wraparound OUTputs:\n");
-			printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2    -1],a[j2    ], b[j2    -1],b[j2    ]);
-			printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p1 -1],a[j2+p1 ], b[j2+p1 -1],b[j2+p1 ]);
-			printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p2 -1],a[j2+p2 ], b[j2+p2 -1],b[j2+p2 ]);
-			printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p3 -1],a[j2+p3 ], b[j2+p3 -1],b[j2+p3 ]);
-			printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p4 -1],a[j2+p4 ], b[j2+p4 -1],b[j2+p4 ]);
-			printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p5 -1],a[j2+p5 ], b[j2+p5 -1],b[j2+p5 ]);
-			printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p6 -1],a[j2+p6 ], b[j2+p6 -1],b[j2+p6 ]);
-			printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p7 -1],a[j2+p7 ], b[j2+p7 -1],b[j2+p7 ]);
-			printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p8 -1],a[j2+p8 ], b[j2+p8 -1],b[j2+p8 ]);
-			printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p9 -1],a[j2+p9 ], b[j2+p9 -1],b[j2+p9 ]);
-			printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p10-1],a[j2+p10], b[j2+p10-1],b[j2+p10]);
-			printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p11-1],a[j2+p11], b[j2+p11-1],b[j2+p11]);
-			printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p12-1],a[j2+p12], b[j2+p12-1],b[j2+p12]);
-			printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p13-1],a[j2+p13], b[j2+p13-1],b[j2+p13]);
-			printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p14-1],a[j2+p14], b[j2+p14-1],b[j2+p14]);
-			printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p15-1],a[j2+p15], b[j2+p15-1],b[j2+p15]);
+			printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2    -1],a[j2    ], b[j2    -1],b[j2    ]);
+			printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p1 -1],a[j2+p1 ], b[j2+p1 -1],b[j2+p1 ]);
+			printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p2 -1],a[j2+p2 ], b[j2+p2 -1],b[j2+p2 ]);
+			printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p3 -1],a[j2+p3 ], b[j2+p3 -1],b[j2+p3 ]);
+			printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p4 -1],a[j2+p4 ], b[j2+p4 -1],b[j2+p4 ]);
+			printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p5 -1],a[j2+p5 ], b[j2+p5 -1],b[j2+p5 ]);
+			printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p6 -1],a[j2+p6 ], b[j2+p6 -1],b[j2+p6 ]);
+			printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p7 -1],a[j2+p7 ], b[j2+p7 -1],b[j2+p7 ]);
+			printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p8 -1],a[j2+p8 ], b[j2+p8 -1],b[j2+p8 ]);
+			printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p9 -1],a[j2+p9 ], b[j2+p9 -1],b[j2+p9 ]);
+			printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p10-1],a[j2+p10], b[j2+p10-1],b[j2+p10]);
+			printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p11-1],a[j2+p11], b[j2+p11-1],b[j2+p11]);
+			printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p12-1],a[j2+p12], b[j2+p12-1],b[j2+p12]);
+			printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p13-1],a[j2+p13], b[j2+p13-1],b[j2+p13]);
+			printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p14-1],a[j2+p14], b[j2+p14-1],b[j2+p14]);
+			printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p15-1],a[j2+p15], b[j2+p15-1],b[j2+p15]);
 		}
 		#endif
 		}
@@ -2392,7 +2392,7 @@ void radix16_dif_pass1	(double a[],             int n)
 													===============*/
 	/*...Block 1: t1,9,17,25 */
 		jt = j1;		jp = j2;
-		/* Debug: check for overflow of + terms: */	ASSERT(HERE, m1+m9 >= m1 && m2+m10 >= m2,"Overflow of [0,8b] term!");
+		/* Debug: check for overflow of + terms: */	ASSERT(m1+m9 >= m1 && m2+m10 >= m2,"Overflow of [0,8b] term!");
 		rt =t9;	t9 =t1 -rt;	t1 =t1 +rt;				rm =m9;	m9 =qreduce(m1 -rm+q4);	m1 =qreduce(m1 +rm   );	//  1, 2 in   0,8b -> 0,b
 		it =t10;t10=t2 -it;	t2 =t2 +it;				im =m10;m10=qreduce(m2 -im+q4);	m2 =qreduce(m2 +im+q4);	//  9,10 in -4b,4b -> 0,b
 
@@ -2784,10 +2784,10 @@ void radix16_dit_pass1	(double a[],             int n)
 	/*...Block 1: t1,9,17,25	*/
 /*
 printf("Block 1 float/int inputs:\n");
-printf("1 ,2  float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t1 ,t2 , m1 ,m2 , q-qreduce_full(m1 ),q-qreduce_full(m2 ));
-printf("9 ,10 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t9 ,t10, m9 ,m10, q-qreduce_full(m9 ),q-qreduce_full(m10));
-printf("17,18 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t17,t18, m17,m18, q-qreduce_full(m17),q-qreduce_full(m18));
-printf("25,26 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t25,t26, m25,m26, q-qreduce_full(m25),q-qreduce_full(m26));
+printf("1 ,2  float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t1 ,t2 , m1 ,m2 , q-qreduce_full(m1 ),q-qreduce_full(m2 ));
+printf("9 ,10 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t9 ,t10, m9 ,m10, q-qreduce_full(m9 ),q-qreduce_full(m10));
+printf("17,18 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t17,t18, m17,m18, q-qreduce_full(m17),q-qreduce_full(m18));
+printf("25,26 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t25,t26, m25,m26, q-qreduce_full(m25),q-qreduce_full(m26));
 */
 		rt =t9 ;	t9 =t1 -rt;	t1 =t1 +rt;			rm =m9 ;	m9 =qreduce(m1 -rm+q4);	m1 =qreduce(m1 +rm);	// +:   0,8b -> 0,b
 		it =t10;	t10=t2 -it;	t2 =t2 +it;			im =m10;	m10=qreduce(m2 -im+q4);	m2 =qreduce(m2 +im);	// -: -4b,4b -> 0,b
@@ -2804,10 +2804,10 @@ printf("25,26 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t
 	/*...Block 3: t5,13,21,29	*/
 /*
 printf("Block 3 float/int inputs:\n");
-printf("5 ,6  float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t5 ,t6 , m5 ,m6 , q-qreduce_full(m5 ),q-qreduce_full(m6 ));
-printf("13,14 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t13,t14, m13,m14, q-qreduce_full(m13),q-qreduce_full(m14));
-printf("21,22 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t21,t22, m21,m22, q-qreduce_full(m21),q-qreduce_full(m22));
-printf("29,30 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t29,t30, m29,m30, q-qreduce_full(m29),q-qreduce_full(m30));
+printf("5 ,6  float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t5 ,t6 , m5 ,m6 , q-qreduce_full(m5 ),q-qreduce_full(m6 ));
+printf("13,14 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t13,t14, m13,m14, q-qreduce_full(m13),q-qreduce_full(m14));
+printf("21,22 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t21,t22, m21,m22, q-qreduce_full(m21),q-qreduce_full(m22));
+printf("29,30 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t29,t30, m29,m30, q-qreduce_full(m29),q-qreduce_full(m30));
 */
 		rt =t13;	t13=t5 -t14;	t5 =t5 +t14;	rm =m13;m13=qreduce(m5-m14+q4);	m5 =qreduce(m5 +m14+q4);	// all 4 outs in -4b,4b;
 					t14=t6 +rt;		t6 =t6 -rt;				m14=qreduce(m6+rm +q4);	m6 =qreduce(m6 -rm +q4);	// reduce all 4 to 0,b.
@@ -2826,10 +2826,10 @@ t21=rt;	rt =(t29-t30)*ISRT2;it =(t29+t30)*ISRT2;	rm = mul_i2(m29-m30+q4);	im = m
 	/*...Block 2: t3,11,19,27	*/
 /*
 printf("Block 2 float/int inputs:\n");
-printf("3 ,4  float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t3 ,t4 , m3 ,m4 , q-qreduce_full(m3 ),q-qreduce_full(m4 ));
-printf("11,12 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t11,t12, m11,m12, q-qreduce_full(m11),q-qreduce_full(m12));
-printf("19,20 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t19,t20, m19,m20, q-qreduce_full(m19),q-qreduce_full(m20));
-printf("27,28 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t27,t28, m27,m28, q-qreduce_full(m27),q-qreduce_full(m28));
+printf("3 ,4  float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t3 ,t4 , m3 ,m4 , q-qreduce_full(m3 ),q-qreduce_full(m4 ));
+printf("11,12 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t11,t12, m11,m12, q-qreduce_full(m11),q-qreduce_full(m12));
+printf("19,20 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t19,t20, m19,m20, q-qreduce_full(m19),q-qreduce_full(m20));
+printf("27,28 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t27,t28, m27,m28, q-qreduce_full(m27),q-qreduce_full(m28));
 */
 		rt =(t12+t11)*ISRT2;it =(t12-t11)*ISRT2;	rm = mul_i2(m12+m11+q4);im = mul_i2(m12-m11+q4);	// 0,b30
 		t11 = t3 -rt;		t3 = t3 +rt;			m11 = m3 -rm;		m3 = m3 +rm;	//  3, 4 in -2b,2b+b30
@@ -2856,10 +2856,10 @@ t19=rt;	rt =t27*s + t28*c;	it =t28*s - t27*c;		cmul_modq8(m27,m28, sm,q8-cm, &rm
 	/*...Block 4: t7,15,23,31	*/
 /*
 printf("Block 4 float/int inputs:\n");
-printf(" 7, 8 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t7 ,t8 , m7 ,m8 , q-qreduce_full(m7 ),q-qreduce_full(m8 ));
-printf("15,16 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t15,t16, m15,m16, q-qreduce_full(m15),q-qreduce_full(m16));
-printf("23,24 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t23,t24, m23,m24, q-qreduce_full(m23),q-qreduce_full(m24));
-printf("31,32 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t31,t32, m31,m32, q-qreduce_full(m31),q-qreduce_full(m32));
+printf(" 7, 8 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t7 ,t8 , m7 ,m8 , q-qreduce_full(m7 ),q-qreduce_full(m8 ));
+printf("15,16 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t15,t16, m15,m16, q-qreduce_full(m15),q-qreduce_full(m16));
+printf("23,24 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t23,t24, m23,m24, q-qreduce_full(m23),q-qreduce_full(m24));
+printf("31,32 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t31,t32, m31,m32, q-qreduce_full(m31),q-qreduce_full(m32));
 exit(0);
 */
 		rt =(t15-t16)*ISRT2;it =(t15+t16)*ISRT2;	rm = mul_i2(m15-m16+q4);im = mul_i2(m15+m16+q4);	// 0,b30
@@ -3143,8 +3143,8 @@ t23=rt;	rt =t31*c + t32*s;	it =t32*c - t31*s;		cmul_modq8(m31,m32, cm,q8-sm, &rm
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -3251,9 +3251,9 @@ t23=rt;	rt =t31*c + t32*s;	it =t32*c - t31*s;		cmul_modq8(m31,m32, cm,q8-sm, &rm
 		half_arr= tmp + 0x12;	/* This table needs 20x16 bytes */
 	  #endif
 
-		ASSERT(HERE, (isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!");
+		ASSERT((isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -3263,18 +3263,18 @@ t23=rt;	rt =t31*c + t32*s;	it =t32*c - t31*s;		cmul_modq8(m31,m32, cm,q8-sm, &rm
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#else	// SSE2:
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		} else {
 		#ifdef USE_AVX512
 			/* No-Op */
 		#else
-			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		}
 
@@ -3742,8 +3742,8 @@ t23=rt;	rt =t31*c + t32*s;	it =t32*c - t31*s;		cmul_modq8(m31,m32, cm,q8-sm, &rm
 		/* Now, finally can update fx and cy: */\
 		*cy   = DNINT(temp*baseinv[i]);	check_nint(*cy, temp*baseinv[i]);/*@*/
 		*fx  = (temp-*cy * base[i])*wt;/*@*/
-	ASSERT(HERE, *fx == (double)rm * wt, "Bad mod-Xout!");	/* put rm into double-version and forward weight *//*@*/
-	ASSERT(HERE, itmp == *cy, "Bad mod-carry!");
+	ASSERT(*fx == (double)rm * wt, "Bad mod-Xout!");	/* put rm into double-version and forward weight *//*@*/
+	ASSERT(itmp == *cy, "Bad mod-carry!");
 	/*========================*//*@*/
 		bjmodn = (bjmodn + bw) & nm1;/*@*/
 			wt   =wtlp1*wtA;/*@*/
diff --git a/src/radix16_dyadic_square.c b/src/radix16_dyadic_square.c
index ef0572e0..38422dee 100755
--- a/src/radix16_dyadic_square.c
+++ b/src/radix16_dyadic_square.c
@@ -154,7 +154,7 @@ void radix16_dyadic_square(
 		b = (double *)(fwd_fft_only & ~0xCull);
 		// BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0:
 		if(fwd_fft_only & 0xC) {
-			ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
+			ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
 			fwd_fft_only = 3ull;
 		}
 	}
@@ -172,9 +172,9 @@ void radix16_dyadic_square(
 /**************************************************************************************************************************************/
 	if((rad0save != radix0) || (nsave != n))
 	{
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		nsave = n;
-		ASSERT(HERE, N2 == n/2, "N2 bad!");
+		ASSERT(N2 == n/2, "N2 bad!");
 		rad0save = radix0;
 		ndivrad0 = n/radix0;
 		for(j = 0; j < ndivrad0; j += stride)
@@ -182,7 +182,7 @@ void radix16_dyadic_square(
 			j1 = j + ( (j >> DAT_BITS) << PAD_BITS );
 			if( (j1+stridh) != (j+stridh) + ( ((j+stridh) >> DAT_BITS) << PAD_BITS ) ) {
 				printf("j, j1, stride/2 = %d,%d,%d, jpad = %d\n",j,j1, stridh, (j+stridh) + (((j+stridh) >> DAT_BITS) << PAD_BITS) );
-				ASSERT(HERE, 0 , "add1 calculation violates padded index rules!");
+				ASSERT(0 , "add1 calculation violates padded index rules!");
 			}
 		}
 		if(index_ptmp0) {
@@ -195,7 +195,7 @@ void radix16_dyadic_square(
 
 		index_ptmp = ALLOC_INT(N2/16);
 		index = ALIGN_INT(index_ptmp);
-		if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		for(i=0; i < N2/16; i++)
 		{
 			index[i]=i;
@@ -205,11 +205,11 @@ void radix16_dyadic_square(
 		index1_mod = (n>>5)/radix0;	/* complex length requires an additional divide by 2 */
 
 		index_ptmp0 = ALLOC_INT(index_ptmp0, index0_mod);
-		if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		index0 = ALIGN_INT(index_ptmp0);
 
 		index_ptmp1 = ALLOC_INT(index_ptmp1, index1_mod);
-		if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		index1 = ALIGN_INT(index_ptmp1);
 
 		for(i=0; i < index0_mod; i++){index0[i]=       i;}
@@ -228,7 +228,7 @@ void radix16_dyadic_square(
 			if(i == radix0)
 				break;
 		}
-		if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 		bit_reverse_int(index0, index0_mod,                 nradices_prim_radix0, &radix_prim[nradices_prim_radix0-1], -1,(int *)arr_scratch);
 		bit_reverse_int(index1, index1_mod, nradices_prim-4-nradices_prim_radix0, &radix_prim[nradices_prim       -5], -1,(int *)arr_scratch);
@@ -243,10 +243,10 @@ void radix16_dyadic_square(
 		if(init_sse2 <= max_threads)	// current alloc sufficient
 			return;
 
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
 
 	#ifdef USE_SSE2
@@ -256,13 +256,13 @@ void radix16_dyadic_square(
 			free((void *)sc_arr);	sc_arr=0x0;
 		}
 		// Index vectors used in SIMD roots-computation.
-		sm_arr = ALLOC_INT(sm_arr, max_threads*10*RE_IM_STRIDE + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sm_arr = ALLOC_INT(sm_arr, max_threads*10*RE_IM_STRIDE + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sm_ptr = ALIGN_INT(sm_arr);
-		ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 		// Twiddles-array:
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads + 100);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads + 100);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots,
 	last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array.
@@ -328,7 +328,7 @@ void radix16_dyadic_square(
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
   #ifdef USE_SSE2
 	k1_arr =   __i0 + thr_id*10*RE_IM_STRIDE;
 	k2_arr = k1_arr + 5*RE_IM_STRIDE;
@@ -357,9 +357,9 @@ void radix16_dyadic_square(
 #endif
 
 	/*...If a new runlength, should not get to this point: */
-	ASSERT(HERE, n == nsave,"n != nsave");
-	ASSERT(HERE, incr == 32,"incr != 32");
-	ASSERT(HERE, ndivrad0 == n/radix0,"bad value for ndivrad0!");
+	ASSERT(n == nsave,"n != nsave");
+	ASSERT(incr == 32,"incr != 32");
+	ASSERT(ndivrad0 == n/radix0,"bad value for ndivrad0!");
 	/*
 	k = ii*(ndivrad0 >> 5);
 	*/
diff --git a/src/radix16_main_carry_loop.h b/src/radix16_main_carry_loop.h
index 26609938..61749fcf 100755
--- a/src/radix16_main_carry_loop.h
+++ b/src/radix16_main_carry_loop.h
@@ -800,22 +800,22 @@ t23=rt;	rt =t31*c + t32*s;	it =t32*c - t31*s;		cmul_modq8(m31,m32, cm,q8-sm, &rm
 
 if(!j) {
 	printf("J = 0, carry-step INputs:\n");
-	printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a1p0r,a1p0i, b1p0r,b1p0i);
-	printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a1p1r,a1p1i, b1p1r,b1p1i);
-	printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a1p2r,a1p2i, b1p2r,b1p2i);
-	printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a1p3r,a1p3i, b1p3r,b1p3i);
-	printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a1p4r,a1p4i, b1p4r,b1p4i);
-	printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a1p5r,a1p5i, b1p5r,b1p5i);
-	printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a1p6r,a1p6i, b1p6r,b1p6i);
-	printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a1p7r,a1p7i, b1p7r,b1p7i);
-	printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a1p8r,a1p8i, b1p8r,b1p8i);
-	printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a1p9r,a1p9i, b1p9r,b1p9i);
-	printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a1pAr,a1pAi, b1pAr,b1pAi);
-	printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a1pBr,a1pBi, b1pBr,b1pBi);
-	printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a1pCr,a1pCi, b1pCr,b1pCi);
-	printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a1pDr,a1pDi, b1pDr,b1pDi);
-	printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a1pEr,a1pEi, b1pEr,b1pEi);
-	printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a1pFr,a1pFi, b1pFr,b1pFi);
+	printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p0r,a1p0i, b1p0r,b1p0i);
+	printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p1r,a1p1i, b1p1r,b1p1i);
+	printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p2r,a1p2i, b1p2r,b1p2i);
+	printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p3r,a1p3i, b1p3r,b1p3i);
+	printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p4r,a1p4i, b1p4r,b1p4i);
+	printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p5r,a1p5i, b1p5r,b1p5i);
+	printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p6r,a1p6i, b1p6r,b1p6i);
+	printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p7r,a1p7i, b1p7r,b1p7i);
+	printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p8r,a1p8i, b1p8r,b1p8i);
+	printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p9r,a1p9i, b1p9r,b1p9i);
+	printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pAr,a1pAi, b1pAr,b1pAi);
+	printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pBr,a1pBi, b1pBr,b1pBi);
+	printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pCr,a1pCi, b1pCr,b1pCi);
+	printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pDr,a1pDi, b1pDr,b1pDi);
+	printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pEr,a1pEi, b1pEr,b1pEi);
+	printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pFr,a1pFi, b1pFr,b1pFi);
 }
 if(!j) {
 	if(full_pass)printf("\n");
@@ -864,22 +864,22 @@ if(!j)
 */
 if(!j) {
 	printf("J = 0, carry-step OUTputs:\n");
-	printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a1p0r,a1p0i, b1p0r,b1p0i);
-	printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a1p1r,a1p1i, b1p1r,b1p1i);
-	printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a1p2r,a1p2i, b1p2r,b1p2i);
-	printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a1p3r,a1p3i, b1p3r,b1p3i);
-	printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a1p4r,a1p4i, b1p4r,b1p4i);
-	printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a1p5r,a1p5i, b1p5r,b1p5i);
-	printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a1p6r,a1p6i, b1p6r,b1p6i);
-	printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a1p7r,a1p7i, b1p7r,b1p7i);
-	printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a1p8r,a1p8i, b1p8r,b1p8i);
-	printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a1p9r,a1p9i, b1p9r,b1p9i);
-	printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a1pAr,a1pAi, b1pAr,b1pAi);
-	printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a1pBr,a1pBi, b1pBr,b1pBi);
-	printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a1pCr,a1pCi, b1pCr,b1pCi);
-	printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a1pDr,a1pDi, b1pDr,b1pDi);
-	printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a1pEr,a1pEi, b1pEr,b1pEi);
-	printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a1pFr,a1pFi, b1pFr,b1pFi);
+	printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p0r,a1p0i, b1p0r,b1p0i);
+	printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p1r,a1p1i, b1p1r,b1p1i);
+	printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p2r,a1p2i, b1p2r,b1p2i);
+	printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p3r,a1p3i, b1p3r,b1p3i);
+	printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p4r,a1p4i, b1p4r,b1p4i);
+	printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p5r,a1p5i, b1p5r,b1p5i);
+	printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p6r,a1p6i, b1p6r,b1p6i);
+	printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p7r,a1p7i, b1p7r,b1p7i);
+	printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p8r,a1p8i, b1p8r,b1p8i);
+	printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p9r,a1p9i, b1p9r,b1p9i);
+	printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pAr,a1pAi, b1pAr,b1pAi);
+	printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pBr,a1pBi, b1pBr,b1pBi);
+	printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pCr,a1pCi, b1pCr,b1pCi);
+	printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pDr,a1pDi, b1pDr,b1pDi);
+	printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pEr,a1pEi, b1pEr,b1pEi);
+	printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pFr,a1pFi, b1pFr,b1pFi);
 
 	printf("\niter %2u [full-pass = %u]: a01 OUT: %20.10e, %20.10e, cy = %20.10e\n",iter,full_pass,a1p0r,a1p0i,cy_r0);
 }
@@ -1309,7 +1309,7 @@ if(!j) {
 													===============*/
 	/*...Block 1: t1,9,17,25 */
 		jt = j1;		jp = j2;
-		/* Debug: check for overflow of + terms: */	ASSERT(HERE, m1+m9 >= m1 && m$+m10 >= m$,"Overflow of [0,8b] term!");
+		/* Debug: check for overflow of + terms: */	ASSERT(m1+m9 >= m1 && m$+m10 >= m$,"Overflow of [0,8b] term!");
 		rt =t9;	t9 =t1 -rt;	t1 =t1 +rt;				rm =m9;	m9 =qreduce(m1 -rm+q4);	m1 =qreduce(m1 +rm   );	// +:   0,8b -> 0,b
 		it =t10;t10=t2 -it;	t2 =t2 +it;				im =m10;m10=qreduce(m$ -im+q4);	m$ =qreduce(m$ +im   );	// -: -4b,4b -> 0,b
 
@@ -1376,22 +1376,22 @@ t23=rt;	rt =t31*c - t32*s;	it =t32*c + t31*s;		cmul_modq8(m31,m32, cm,sm,  &rm,
 
 if(!j) {
 	printf("J = 0, DIF1 OUTputs:\n");
-	printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1    ],a[j1    +1], b[j1    ],b[j1    +1]);
-	printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p1 ],a[j1+p1 +1], b[j1+p1 ],b[j1+p1 +1]);
-	printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p2 ],a[j1+p2 +1], b[j1+p2 ],b[j1+p2 +1]);
-	printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p3 ],a[j1+p3 +1], b[j1+p3 ],b[j1+p3 +1]);
-	printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p4 ],a[j1+p4 +1], b[j1+p4 ],b[j1+p4 +1]);
-	printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p5 ],a[j1+p5 +1], b[j1+p5 ],b[j1+p5 +1]);
-	printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p6 ],a[j1+p6 +1], b[j1+p6 ],b[j1+p6 +1]);
-	printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p7 ],a[j1+p7 +1], b[j1+p7 ],b[j1+p7 +1]);
-	printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p8 ],a[j1+p8 +1], b[j1+p8 ],b[j1+p8 +1]);
-	printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p9 ],a[j1+p9 +1], b[j1+p9 ],b[j1+p9 +1]);
-	printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p10],a[j1+p10+1], b[j1+p10],b[j1+p10+1]);
-	printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p11],a[j1+p11+1], b[j1+p11],b[j1+p11+1]);
-	printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p12],a[j1+p12+1], b[j1+p12],b[j1+p12+1]);
-	printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p13],a[j1+p13+1], b[j1+p13],b[j1+p13+1]);
-	printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p14],a[j1+p14+1], b[j1+p14],b[j1+p14+1]);
-	printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p15],a[j1+p15+1], b[j1+p15],b[j1+p15+1]);
+	printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1    ],a[j1    +1], b[j1    ],b[j1    +1]);
+	printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p1 ],a[j1+p1 +1], b[j1+p1 ],b[j1+p1 +1]);
+	printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p2 ],a[j1+p2 +1], b[j1+p2 ],b[j1+p2 +1]);
+	printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p3 ],a[j1+p3 +1], b[j1+p3 ],b[j1+p3 +1]);
+	printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p4 ],a[j1+p4 +1], b[j1+p4 ],b[j1+p4 +1]);
+	printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p5 ],a[j1+p5 +1], b[j1+p5 ],b[j1+p5 +1]);
+	printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p6 ],a[j1+p6 +1], b[j1+p6 ],b[j1+p6 +1]);
+	printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p7 ],a[j1+p7 +1], b[j1+p7 ],b[j1+p7 +1]);
+	printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p8 ],a[j1+p8 +1], b[j1+p8 ],b[j1+p8 +1]);
+	printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p9 ],a[j1+p9 +1], b[j1+p9 ],b[j1+p9 +1]);
+	printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p10],a[j1+p10+1], b[j1+p10],b[j1+p10+1]);
+	printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p11],a[j1+p11+1], b[j1+p11],b[j1+p11+1]);
+	printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p12],a[j1+p12+1], b[j1+p12],b[j1+p12+1]);
+	printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p13],a[j1+p13+1], b[j1+p13],b[j1+p13+1]);
+	printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p14],a[j1+p14+1], b[j1+p14],b[j1+p14+1]);
+	printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p15],a[j1+p15+1], b[j1+p15],b[j1+p15+1]);
 }
 			/**********************************************/
 	#else	// USE_FGT61 = False; Basic scalar-double mode:
diff --git a/src/radix16_pairFFT_mul.c b/src/radix16_pairFFT_mul.c
index bbe821db..67793baa 100755
--- a/src/radix16_pairFFT_mul.c
+++ b/src/radix16_pairFFT_mul.c
@@ -247,7 +247,7 @@ void radix16_pairFFT_mul(
 	if(INIT_ARRAYS)
 	{
 		nsave = n;
-		ASSERT(HERE, N2 == n/2, "N2 bad!");
+		ASSERT(N2 == n/2, "N2 bad!");
 
 	#if SYMM == 2	// Use complex-plane symmetries to reduce fraction of rt1 array actually needed
 		nh = n/(NRT<<2);	// #rt1 elts in each quadrant
@@ -282,7 +282,7 @@ void radix16_pairFFT_mul(
 			free((void *)index_ptmp);	index_ptmp=0x0;
 		}
 		index_ptmp = ALLOC_INT(index_ptmp, N2/16);
-		ASSERT(HERE, index_ptmp != 0,"ERROR: unable to allocate array INDEX!");
+		ASSERT(index_ptmp != 0,"ERROR: unable to allocate array INDEX!");
 		index = ALIGN_INT(index_ptmp);
 	/*
 	!...Now rearrange FFT sincos indices using the main loop structure as a template.
@@ -321,7 +321,7 @@ void radix16_pairFFT_mul(
 		  if(j2_start == n-32)break;
 
 		  blocklen_sum = blocklen_sum + blocklen;
-		  ASSERT(HERE, i != 0,"ERROR 10!");
+		  ASSERT(i != 0,"ERROR 10!");
 		  blocklen = (radix_prim[i-1]-1)*blocklen_sum;
 
 		  j2_start = j2_start+(blocklen<<2);
@@ -342,14 +342,14 @@ void radix16_pairFFT_mul(
 	/*...If a new runlength, should not get to this point: */
 	if(n != nsave) {
 		sprintf(cbuf,"ERROR: %s: INIT_ARRAYS not invoked for new runlength!",func);
-		ASSERT(HERE, 0,cbuf);
+		ASSERT(0,cbuf);
 	}
 
 	/* If precomputing a forward FFT of a set of inputs, make sure
 	they're in the uv-vector and the abcd-multiplier vectors are null: */
 	if(FORWARD_FFT_ONLY == 1 && (ab_mul != 0x0 || cd_mul != 0x0)) {
 		sprintf(cbuf,"%s: FORWARD_FFT_ONLY = TRUE but non-null abcd-multiplier vectors!",func);
-		ASSERT(HERE, 0,cbuf);
+		ASSERT(0,cbuf);
 	}
 
 /* Init the loop-control variables: */
@@ -1179,7 +1179,7 @@ for(i = nradices_prim-5; i >= 0; i-- )	/* Main loop: lower bound = nradices_prim
 			// Dec 2015: Despite all my efforts, simply not yet able to wring out remaining bug(s) in indexing scheme
 			// here. If and when I do finally get things working, also need to fuse the 2 x PAIR_MUL occurrences on
 			// each line into a working single ABCD_MUL macro, which avoids the work-duplication of the 2 x PAIR_MUL:
-			ASSERT(HERE, 0, "Linear-combo algorithm not yet working!");
+			ASSERT(0, "Linear-combo algorithm not yet working!");
 			/*
 			Dyadic muls of the forward FFT outputs with the corresponding a/b and c/d-vector data so as to
 			obtain FFT(a*u-b*v, c*u-d*v). u,v in ajp*r,i; a,b in ab_mul[even,odd]; c,d in cd_mul[even,odd]:
diff --git a/src/radix16_wrapper_ini.c b/src/radix16_wrapper_ini.c
index b37fd430..31285043 100755
--- a/src/radix16_wrapper_ini.c
+++ b/src/radix16_wrapper_ini.c
@@ -75,7 +75,7 @@ void radix16_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int r
 				ws_m           [iblock_next] = m           ;
 				ws_blocklen    [iblock_next] = blocklen    ;
 				ws_blocklen_sum[iblock_next] = blocklen_sum;
-			//	printf("%8llu  %20llu  %8llu: init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
+			//	printf("%8" PRIu64 "  %20" PRIu64 "  %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
 				return;
 			}
 	jump_in:	// Entry point for all blocks but the first.
diff --git a/src/radix16_wrapper_square.c b/src/radix16_wrapper_square.c
index 296b3819..ad9f042b 100755
--- a/src/radix16_wrapper_square.c
+++ b/src/radix16_wrapper_square.c
@@ -201,7 +201,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		b = (double *)(fwd_fft_only & ~0xCull);
 		// BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0:
 		if(fwd_fft_only & 0xC) {
-			ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
+			ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
 			fwd_fft_only = 3ull;
 		}
 	}
@@ -220,10 +220,10 @@ The scratch array (2nd input argument) is only needed for data table initializat
 		nsave = n;
 		if(init_sse2 > max_threads)	// current SIMD local-alloc insufficient
 		{
-			ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+			ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			max_threads = init_sse2;
 		#ifndef COMPILER_TYPE_GCC
-			ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+			ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 		#endif
 
 		#ifdef USE_SSE2
@@ -235,14 +235,14 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			// Index vectors used in SIMD roots-computation.
 			// The AVX512 compute-sincos-mults code needs 2 elements per complex-double-load, so use 10*RE_IM_STRIDE per array
 			// to alloc storage here for all cases, even though that leaves upper array halves unused for sub-AVX512.
-			sm_arr = ALLOC_INT(sm_arr, max_threads*20*RE_IM_STRIDE + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sm_arr = ALLOC_INT(sm_arr, max_threads*20*RE_IM_STRIDE + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sm_ptr = ALIGN_INT(sm_arr);
-			ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+			ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 			// Twiddles-array: Need 0x47 slots for data, plus need to leave room to pad-align.
 			// v20: To support inline a*(b-c) for p-1 stage 2, need 2*RADIX = 32 added vec_dbl, thus 0x4c ==> 0x6c:
-			sc_arr = ALLOC_VEC_DBL(sc_arr, 0x6c*max_threads);	ASSERT(HERE, sc_arr != 0,"ERROR: unable to allocate sc_arr!");
+			sc_arr = ALLOC_VEC_DBL(sc_arr, 0x6c*max_threads);	ASSERT(sc_arr != 0,"ERROR: unable to allocate sc_arr!");
 			sc_ptr = ALIGN_VEC_DBL(sc_arr);
-			ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			/* Use low 32 16-byte slots of sc_arr for temporaries, next 4 for const = 1/4 and nontrivial complex 16th roots,
 			last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array: */
 		  #ifdef MULTITHREAD
@@ -392,12 +392,12 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			free((void *)twidl_ptmp);	twidl_ptmp = 0x0;
 		#endif
 		}
-		index_ptmp = ALLOC_INT(index_ptmp, N2/16);	ASSERT(HERE, index_ptmp != 0,"ERROR: unable to allocate array INDEX!");
+		index_ptmp = ALLOC_INT(index_ptmp, N2/16);	ASSERT(index_ptmp != 0,"ERROR: unable to allocate array INDEX!");
 		index = ALIGN_INT(index_ptmp);
 	#ifdef USE_PRECOMPUTED_TWIDDLES
 	printf("%s: Alloc precomputed-twiddles array with %u Kdoubles.\n",func,N2*15/8);
-		twidl_ptmp = ALLOC_COMPLEX(twidl_ptmp, N2*15/16);	ASSERT(HERE, twidl_ptmp != 0,"ERROR: unable to allocate twidl_ptmp!");
-		twidl = ALIGN_COMPLEX(twidl_ptmp);	ASSERT(HERE, ((long)twidl & 0x3f) == 0, "twidl-array not 64-byte aligned!");
+		twidl_ptmp = ALLOC_COMPLEX(twidl_ptmp, N2*15/16);	ASSERT(twidl_ptmp != 0,"ERROR: unable to allocate twidl_ptmp!");
+		twidl = ALIGN_COMPLEX(twidl_ptmp);	ASSERT(((long)twidl & 0x3f) == 0, "twidl-array not 64-byte aligned!");
 	#endif
 	/*
 	!...Now rearrange FFT sincos indices using the main loop structure as a template.
@@ -427,7 +427,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 			k1 = k1 + (blocklen >> 1);
 			if(j2_start == n-32)break;
 			blocklen_sum = blocklen_sum + blocklen;
-			ASSERT(HERE, i != 0,"ERROR 10!");
+			ASSERT(i != 0,"ERROR 10!");
 			blocklen = (radix_prim[i-1]-1)*blocklen_sum;
 			j2_start = j2_start+(blocklen<<2);
 		}
@@ -1066,7 +1066,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
   #ifdef USE_SSE2
 	k1_arr = __i0 + thr_id*20*RE_IM_STRIDE;
 	k2_arr = k1_arr +      10*RE_IM_STRIDE;
@@ -1098,7 +1098,7 @@ The scratch array (2nd input argument) is only needed for data table initializat
   #endif
 #endif
 	/*...If a new runlength, should not get to this point: */
-	ASSERT(HERE, n == nsave,"n != nsave");
+	ASSERT(n == nsave,"n != nsave");
 
 /*
 !   SOLVING THE CACHE FLOW PROBLEM FOR BIT-REVERSED ARRAY DATA:
diff --git a/src/radix176_ditN_cy_dif1.c b/src/radix176_ditN_cy_dif1.c
index efa7da0f..0a32d5f5 100755
--- a/src/radix176_ditN_cy_dif1.c
+++ b/src/radix176_ditN_cy_dif1.c
@@ -368,7 +368,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -438,11 +438,11 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -480,7 +480,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -490,7 +490,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -530,24 +530,24 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix176_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix176_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix176_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -1099,12 +1099,12 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1128,7 +1128,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1281,8 +1281,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1292,8 +1292,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1302,26 +1302,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1451,7 +1451,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy176_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy176_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1461,7 +1461,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2210,8 +2210,8 @@ void radix176_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2341,21 +2341,21 @@ void radix176_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix192_ditN_cy_dif1.c b/src/radix192_ditN_cy_dif1.c
index ac366f69..acedd916 100755
--- a/src/radix192_ditN_cy_dif1.c
+++ b/src/radix192_ditN_cy_dif1.c
@@ -306,7 +306,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -364,7 +364,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		i = 1;
 	  #endif
 		if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) {	// Only care about this divisibility property for LOACC carry modes:
-			ASSERT(HERE, 0 == ((RADIX/i) % incr),"Carry-chain wts-multipliers recurrence length must divide RADIX/[n-wayness of carry macro]!");
+			ASSERT(0 == ((RADIX/i) % incr),"Carry-chain wts-multipliers recurrence length must divide RADIX/[n-wayness of carry macro]!");
 		}
 		// For n a power of 2 don't need to worry about 32-bit integer overflow in the sw*NDIVR term,
 		// but for non-power-of-2 n we must cast-to-uint64 to avoid such overflows fubaring the result:
@@ -376,11 +376,11 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -418,7 +418,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -428,7 +428,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -468,24 +468,24 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix192_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix192_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix192_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -515,8 +515,8 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x364; This is where the value of half_arr_offset192 comes from
 		half_arr= tmp + 0x02;	// This table needs 32*SZ_VD bytes in sse2 mode
 	  #endif
-//		ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix192_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix192_creals_in_local_store checksum failed!");
+//		ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix192_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix192_creals_in_local_store checksum failed!");
 
 		/* Roots of 1 for radix-3 DFTs: cc0 = (cc1+cc2+cc3)/3 - 1; subtract 1 from Nussbaumer's definition in order to ease in-place computation */
 		VEC_DBL_INIT(cc0, c3m1);
@@ -1074,12 +1074,12 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1103,7 +1103,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1255,8 +1255,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1266,8 +1266,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1276,26 +1276,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1425,7 +1425,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy192_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy192_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1435,7 +1435,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2323,8 +2323,8 @@ void radix192_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2588,21 +2588,21 @@ void radix192_dit_pass1(double a[], int n)
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x364; This is where the value of half_arr_offset192 comes from
 		half_arr= tmp + 0x02;	// This table needs 20*SZ_VD bytes in sse2 mode
 	  #endif
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix208_ditN_cy_dif1.c b/src/radix208_ditN_cy_dif1.c
index 7dfc3653..fe506677 100755
--- a/src/radix208_ditN_cy_dif1.c
+++ b/src/radix208_ditN_cy_dif1.c
@@ -336,7 +336,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -404,11 +404,11 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -446,7 +446,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -456,7 +456,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -496,24 +496,24 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix208_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix208_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -548,7 +548,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x3c4; This is where the value of half_arr_offset208 comes from
 		half_arr= tmp + 0x02;	// This table needs 32*SZ_VD bytes in sse2 mode
 	  #endif
-		ASSERT(HERE, (radix208_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!");
+		ASSERT((radix208_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
 	  #if 1
@@ -971,12 +971,12 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1000,7 +1000,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1152,8 +1152,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1163,8 +1163,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1173,26 +1173,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1322,7 +1322,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy208_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy208_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1332,7 +1332,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2017,8 +2017,8 @@ void radix208_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2109,21 +2109,21 @@ void radix208_dit_pass1(double a[], int n)
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x3c4; This is where the value of half_arr_offset208 comes from
 		half_arr= tmp + 0x02;	// This table needs 20*SZ_VD bytes in sse2 mode
 	  #endif
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix20_ditN_cy_dif1.c b/src/radix20_ditN_cy_dif1.c
index 0cdaaa8e..e10c5ff3 100755
--- a/src/radix20_ditN_cy_dif1.c
+++ b/src/radix20_ditN_cy_dif1.c
@@ -279,7 +279,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "radix20_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "radix20_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -346,11 +346,11 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -389,7 +389,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -399,7 +399,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -435,18 +435,18 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix20_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix20_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 80 16-byte slots of sc_arr for temporaries, next 5 for the nontrivial complex 16th roots,
 	next 10 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -914,12 +914,12 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		_cy18	= (double *)malloc(j);	ptr_prod += (uint32)(_cy18== 0x0);
 		_cy19	= (double *)malloc(j);	ptr_prod += (uint32)(_cy19== 0x0);
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix20_ditN_cy_dif1.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix20_ditN_cy_dif1.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/20-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix20_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix20_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -943,7 +943,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		first_entry=FALSE;
 	}	/* endif(first_entry) */
@@ -1113,8 +1113,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1124,8 +1124,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1134,20 +1134,20 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1367,7 +1367,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy20_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy20_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1377,7 +1377,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix32_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1447,7 +1447,7 @@ for(outer=0; outer <= 1; outer++)
 
 	for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 	{
-		ASSERT(HERE, CY_THREADS > 1,"radix20_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
+		ASSERT(CY_THREADS > 1,"radix20_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
 		_cy00[ithread] = _cy00[ithread-1];
 		_cy01[ithread] = _cy01[ithread-1];
 		_cy02[ithread] = _cy02[ithread-1];
@@ -1882,8 +1882,8 @@ void radix20_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -1952,17 +1952,17 @@ void radix20_dit_pass1(double a[], int n)
 								s1p18r = tmp + 0x24;		half_arr= tmp + 0x3a;	/* This table needs 20x16 bytes */
 								s1p19r = tmp + 0x26;
 	  #endif
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 		tmp = half_arr;
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix224_ditN_cy_dif1.c b/src/radix224_ditN_cy_dif1.c
index 4dce5db4..390fd14d 100755
--- a/src/radix224_ditN_cy_dif1.c
+++ b/src/radix224_ditN_cy_dif1.c
@@ -459,7 +459,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(first_entry)
 	{
-		ASSERT(HERE, LO_ADD,"LO_ADD");
+		ASSERT(LO_ADD,"LO_ADD");
 		psave = p;	nsave = n;
 		radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)RADIX));
 		n2inv     = qfdbl(qf_rational_quotient((int64)1, (int64)(n/2)));
@@ -491,11 +491,11 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -533,7 +533,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -543,7 +543,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -588,24 +588,24 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of radix224_creals_in_local_store vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix224_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix224_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -648,8 +648,8 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x(396 + e0 + 2) = 0x478; This is where the value of half_arr_offset224 comes from
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, half_arr_offset224 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix224_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix224_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset224 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix224_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix224_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
@@ -750,7 +750,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1640,12 +1640,12 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1680,7 +1680,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1731,7 +1731,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				};
 			}
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -2147,8 +2147,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -2158,8 +2158,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -2168,19 +2168,19 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2189,11 +2189,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -2203,8 +2203,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -2426,7 +2426,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy224_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy224_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -2436,7 +2436,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -3478,8 +3478,8 @@ void radix224_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -4058,17 +4058,17 @@ void radix224_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
 	  #if defined(USE_AVX2) || defined(USE_ARM_V8_SIMD)
 		// AVX2 (i.e. FMA)means non-Nussbaumer radix-7, uses these sincos constants:
-		ASSERT(HERE, (ds3->d0 == 0.0 && ds3->d1 == 0.0), "thread-local memcheck failed!");
+		ASSERT((ds3->d0 == 0.0 && ds3->d1 == 0.0), "thread-local memcheck failed!");
 	  #else
 		/* SSE2 version assumes LO_ADD = 0, i.e. the low-mul Nussbaumer-style DFT implementation: */
-		ASSERT(HERE, (ds3->d0 == sx3 && ds3->d1 == sx3), "thread-local memcheck failed!");
+		ASSERT((ds3->d0 == sx3 && ds3->d1 == sx3), "thread-local memcheck failed!");
 	  #endif
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -4077,15 +4077,15 @@ void radix224_dit_pass1(double a[], int n)
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix240_ditN_cy_dif1.c b/src/radix240_ditN_cy_dif1.c
index 5b224f28..fde8736a 100755
--- a/src/radix240_ditN_cy_dif1.c
+++ b/src/radix240_ditN_cy_dif1.c
@@ -477,11 +477,11 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -519,7 +519,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -529,7 +529,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -572,24 +572,24 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix240_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix240_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -662,8 +662,8 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, half_arr_offset240 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix240_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix240_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset240 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix240_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix240_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one  , 1.0  );
@@ -753,7 +753,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1174,12 +1174,12 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1214,7 +1214,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1265,7 +1265,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				};
 			}	//	printf("wts_idx_incr = %u\n",wts_idx_incr);
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1748,8 +1748,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1759,8 +1759,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1769,19 +1769,19 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -1790,11 +1790,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1804,8 +1804,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -2026,7 +2026,7 @@ for(outer=0; outer <= 1; outer++)
 	/*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/
 	for(j = 0; j < main_work_units; ++j)
 	{
-		ASSERT(HERE, 0x0 == cy240_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy240_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -2036,7 +2036,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2202,7 +2202,7 @@ void radix240_dif_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n/RADIX;
 
@@ -2428,7 +2428,7 @@ void radix240_dit_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n/RADIX;
 
@@ -2853,8 +2853,8 @@ void radix240_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2991,10 +2991,10 @@ void radix240_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -3003,15 +3003,15 @@ void radix240_dit_pass1(double a[], int n)
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix240_main_carry_loop.h b/src/radix240_main_carry_loop.h
index c1aa87d8..093364cf 100755
--- a/src/radix240_main_carry_loop.h
+++ b/src/radix240_main_carry_loop.h
@@ -121,7 +121,7 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 			ke = kd-1; ke += (-(ke < 0))&15;	kd = (kd << 5) + jt;
 												ke = (ke << 5) + jt;
 		//	printf("15-DFT #%2u: [k0-E]/2 = %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",l,k0/2,k1/2,k2/2,k3/2,k4/2,k5/2,k6/2,k7/2,k8/2,k9/2,ka/2,kb/2,kc/2,kd/2,ke/2);
-		//	printf("0x0%2X%2X%2X%2X%2X%2X%2X,0x%2X%2X%2X%2X%2X%2X%2X%2X\n",ke/2,kd/2,kc/2,kb/2,ka/2,k9/2,k8/2,k7/2,k6/2,k5/2,k4/2,k3/2,k2/2,k1/2,k0/2);
+		//	printf("0x0%2X%2X%2X%2X%2X%2X%2X,%#2X%2X%2X%2X%2X%2X%2X%2X\n",ke/2,kd/2,kc/2,kb/2,ka/2,k9/2,k8/2,k7/2,k6/2,k5/2,k4/2,k3/2,k2/2,k1/2,k0/2);
 			// Input ptrs:		// Output ptrs:
 			va0 = tmp     ;		vc0 = tm2 + k0;
 			va1 = tmp+0x02;		vc1 = tm2 + k1;
diff --git a/src/radix24_ditN_cy_dif1.c b/src/radix24_ditN_cy_dif1.c
index cf4478ad..08eb441a 100755
--- a/src/radix24_ditN_cy_dif1.c
+++ b/src/radix24_ditN_cy_dif1.c
@@ -299,7 +299,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "radix24_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "radix24_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -361,11 +361,11 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -404,7 +404,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -414,7 +414,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -451,18 +451,18 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix24_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix24_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 48 16-byte slots of sc_arr for temporaries, next 2 for the doubled cos and c3m1 terms,
 	next 12 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -855,7 +855,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		p07 = p06 + p01;
 		p08 = p07 + p01;
 		p16 = p08 + p08;
-		ASSERT(HERE, p16 == p08+p08, "p16 != p08+p08; radix24 ASM macro requires this!");
+		ASSERT(p16 == p08+p08, "p16 != p08+p08; radix24 ASM macro requires this!");
 		p01 = p01 + ( (p01 >> DAT_BITS) << PAD_BITS );
 		p02 = p02 + ( (p02 >> DAT_BITS) << PAD_BITS );
 		p03 = p03 + ( (p03 >> DAT_BITS) << PAD_BITS );
@@ -992,12 +992,12 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		_cy_22	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_22== 0x0);
 		_cy_23	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_23== 0x0);
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix24_ditN_cy_dif1.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix24_ditN_cy_dif1.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/24-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix24_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix24_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1021,7 +1021,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		first_entry=FALSE;
 	}	/* endif(first_entry) */
@@ -1198,8 +1198,8 @@ for(outer=0; outer <= 1; outer++)
 	for(ithread = 0; ithread < CY_THREADS; ithread++)
 	{
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1209,8 +1209,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1219,26 +1219,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].s1p00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].s1p00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		tdat[ithread].bjmodn00 = _bjmodn00[ithread];
@@ -1507,7 +1507,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy24_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy24_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1517,7 +1517,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix32_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1595,7 +1595,7 @@ for(outer=0; outer <= 1; outer++)
 
 	for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 	{
-		ASSERT(HERE, CY_THREADS > 1,"radix24_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
+		ASSERT(CY_THREADS > 1,"radix24_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
 		_cy_00[ithread] = _cy_00[ithread-1];
 		_cy_01[ithread] = _cy_01[ithread-1];
 		_cy_02[ithread] = _cy_02[ithread-1];
@@ -2018,8 +2018,8 @@ void radix24_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2123,21 +2123,21 @@ void radix24_dit_pass1(double a[], int n)
 		sse2_rnd= s1p00 + 0x41;
 		half_arr= s1p00 + 0x42;	/* This table needs 20x16 bytes */
 	  #endif
-		ASSERT(HERE, (s1p00 == thread_arg->s1p00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((s1p00 == thread_arg->s1p00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix256_ditN_cy_dif1.c b/src/radix256_ditN_cy_dif1.c
index c29b3a7c..97026476 100755
--- a/src/radix256_ditN_cy_dif1.c
+++ b/src/radix256_ditN_cy_dif1.c
@@ -263,7 +263,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
 	struct complex t[RADIX], *tptr;
-	double *addr,*addi;
+	const double *addr,*addi;
 	int *itmp,*itm2;	// Pointer into the bjmodn array
 	int err;
 	static int first_entry=TRUE;
@@ -414,11 +414,11 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#if !defined(USE_SSE2) && defined(USE_FMA)
 		// Precompute the FMA-modified twiddles for the 2nd-pass radix-16 DFTs:
@@ -552,7 +552,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -562,7 +562,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -605,23 +605,23 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 256 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread
 		cslots_in_local_store = radix256_creals_in_local_store + (20+RADIX/2)/2;	// Just add enough int64 space for both cases, plus some
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix256_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -665,8 +665,8 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, half_arr_offset256 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix256_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix256_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset256 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix256_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix256_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one  , 1.0  );
@@ -815,7 +815,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				qc = qfcos(qt);	qs = qfsin(qt);
 				qx = QONE;		qy = QZRO;
 				for(j = 0; j < RADIX; j++) {
-					printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+					printf("j = %3u: cos = 0x%16" PRIX64 "\n",j,qfdbl_as_uint64(qx));	// Literal 0x prefix: the '#' flag would print "0X", omit it for 0, and count it inside the width
 					// Up-multiply the complex exponential:
 					qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 					qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1263,14 +1263,14 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/radix-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < NDIVR/CY_THREADS; j++)
@@ -1290,7 +1290,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 			if(CY_THREADS > 1)
 			{
 				for(ithread = 1; ithread < CY_THREADS; ithread++)
@@ -1469,8 +1469,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1480,8 +1480,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1490,20 +1490,20 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
-		ASSERT(HERE, ((tmp + 0x400)->d0 == 2.0 && (tmp + 0x400)->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT(((tmp + 0x400)->d0 == 2.0 && (tmp + 0x400)->d1 == 2.0), "thread-local memcheck failed!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1513,11 +1513,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1530,8 +1530,8 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_SSE2)
 			// This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots:
-			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			// scale gets set immediately prior to calling carry macro, hence no use checking it here.
 			/* init carries	*/
@@ -1754,7 +1754,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy256_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy256_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1764,7 +1764,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("%s end  ; #tasks = %d, #free_tasks = %d\n",func, tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -2276,8 +2276,8 @@ void radix256_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2438,12 +2438,12 @@ void radix256_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
-	//	ASSERT(HERE, (isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!");	Disable to allow alternate "rounded down" variant of isrt2,sqrt2
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
+	//	ASSERT((isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!");	Disable to allow alternate "rounded down" variant of isrt2,sqrt2
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2453,18 +2453,18 @@ void radix256_dit_pass1(double a[], int n)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#else	// SSE2:
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		} else {
 		#ifdef USE_AVX512
 			/* No-Op */
 		#else
-			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		}
 
diff --git a/src/radix256_main_carry_loop.h b/src/radix256_main_carry_loop.h
index f6cc4398..c865409d 100755
--- a/src/radix256_main_carry_loop.h
+++ b/src/radix256_main_carry_loop.h
@@ -365,8 +365,8 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee
 		// In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass:
 		if(target_idx == j) {
 		#ifdef USE_SSE2
-			addr = (double *)s1p00 + target_set;
-			*addr += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
+			double *addr_ = (double *)s1p00 + target_set;
+			*addr_ += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
 		#else
 			// target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum:
 			l = target_set&1;	target_set >>= 1;
@@ -667,26 +667,28 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee
 	  if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) {	// LOACC with tunable DWT-weights chaining
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy_r; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy_r;
 		for(ntmp = 0; ntmp < RADIX>>2; ntmp++) {
 			jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp];	// poff[] = p04,08,...
 			// Re-init weights every 4th macro invocation to keep errors under control:
-			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  } else {	// HiACC:
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy_r; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy_r;
 		for(ntmp = 0; ntmp < RADIX>>2; ntmp++) {
 			jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp];	// poff[] = p04,08,...
-			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_pow2_errcheck0(a[jt    ],a[jp    ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  }	// LOACC or HIACC?
@@ -880,13 +882,14 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee
 	#else	// Scalar-double mode:
 
 		// Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2):
-		ntmp = 0; addr = cy_r; addi = cy_i;
+		ntmp = 0;
+		double *addr_ = cy_r, *addi_ = cy_i;
 		for(m = 0; m < RADIX>>2; m++) {
 			jt = j1 + poff[m]; jp = j2 + poff[m];	// poff[] = p04,08,...
-			fermat_carry_norm_pow2_errcheck(a[jt    ],a[jp    ],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
-			fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr; ++addi;
+			fermat_carry_norm_pow2_errcheck(a[jt    ],a[jp    ],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
+			fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult);	ntmp += NDIVR; ++addr_; ++addi_;
 		}
 
 	#endif	/* #ifdef USE_SSE2 */
diff --git a/src/radix288_ditN_cy_dif1.c b/src/radix288_ditN_cy_dif1.c
index 31909e73..d074803f 100755
--- a/src/radix288_ditN_cy_dif1.c
+++ b/src/radix288_ditN_cy_dif1.c
@@ -299,7 +299,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -367,11 +367,11 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -409,7 +409,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -419,7 +419,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -459,24 +459,24 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix288_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix288_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix288_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -520,7 +520,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x(492 + 90 + 2) = 0x524; This is where the value of half_arr_offset288 comes from
 		half_arr= tmp + 0x02;
 	  #endif
-		ASSERT(HERE, (radix288_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix288_creals_in_local_store checksum failed!");
+		ASSERT((radix288_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix288_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
 		VEC_DBL_INIT(sqrt2, SQRT2);	VEC_DBL_INIT(isrt2, ISRT2);
@@ -1573,12 +1573,12 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1602,7 +1602,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1755,8 +1755,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1766,8 +1766,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1776,26 +1776,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1928,7 +1928,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy288_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy288_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1938,7 +1938,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -3086,8 +3086,8 @@ void radix288_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -3856,21 +3856,21 @@ void radix288_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix28_ditN_cy_dif1.c b/src/radix28_ditN_cy_dif1.c
index 64c4dc73..08d84010 100755
--- a/src/radix28_ditN_cy_dif1.c
+++ b/src/radix28_ditN_cy_dif1.c
@@ -518,7 +518,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(first_entry)
 	{
-		ASSERT(HERE, LO_ADD,"LO_ADD");
+		ASSERT(LO_ADD,"LO_ADD");
 		psave = p;	nsave = n;
 		radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)RADIX));
 		n2inv     = qfdbl(qf_rational_quotient((int64)1, (int64)(n/2)));
@@ -544,11 +544,11 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -586,7 +586,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -596,7 +596,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -639,24 +639,24 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of radix28_creals_in_local_store vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix28_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix28_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 56 16-byte slots of sc_arr for temporaries, next 8 for the nontrivial complex 16th roots,
 	next 28 for the doubled carry pairs, next 2 for ROE and RND_CONST, next RADIX for the half_arr table lookup stuff,
@@ -1154,13 +1154,13 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 		poff[0] =   0; poff[1] = p04; poff[2] = p08; poff[3] = p12; poff[4] = p16; poff[5] = p20; poff[6] = p24;
 
-		ASSERT(HERE, p01+p01 == p02, "p01+p01 != p02");
-		ASSERT(HERE, p02+p02 == p04, "p02+p02 != p04");
-		ASSERT(HERE, p04+p04 == p08, "p04+p04 != p08");
-		ASSERT(HERE, p08+p04 == p12, "p08+p04 != p12");
-		ASSERT(HERE, p12+p04 == p16, "p12+p04 != p16");
-		ASSERT(HERE, p16+p04 == p20, "p16+p04 != p20");
-		ASSERT(HERE, p20+p04 == p24, "p20+p04 != p24");
+		ASSERT(p01+p01 == p02, "p01+p01 != p02");
+		ASSERT(p02+p02 == p04, "p02+p02 != p04");
+		ASSERT(p04+p04 == p08, "p04+p04 != p08");
+		ASSERT(p08+p04 == p12, "p08+p04 != p12");
+		ASSERT(p12+p04 == p16, "p12+p04 != p16");
+		ASSERT(p16+p04 == p20, "p16+p04 != p20");
+		ASSERT(p20+p04 == p24, "p20+p04 != p24");
 
 		if(_cy_r[0])	/* If it's a new exponent of a range test, need to deallocate these. */
 		{
@@ -1197,12 +1197,12 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1237,7 +1237,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1288,7 +1288,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				};
 			}
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1648,8 +1648,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1659,8 +1659,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1669,27 +1669,27 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].s1p00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].s1p00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].s1p00r;
-		ASSERT(HERE, ((tmp + 0x38)->d0 == 2.0 && (tmp + 0x38)->d1 == 2.0), "thread-local memcheck failed!");
-		ASSERT(HERE, ((tmp + half_arr_offset28-1)->d0 == crnd && (tmp + half_arr_offset28-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp + 0x38)->d0 == 2.0 && (tmp + 0x38)->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT(((tmp + half_arr_offset28-1)->d0 == crnd && (tmp + half_arr_offset28-1)->d1 == crnd), "thread-local memcheck failed!");
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 		#ifdef USE_AVX
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp + half_arr_offset28+40)->d0 * (tmp + half_arr_offset28+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp + half_arr_offset28+40)->d1 * (tmp + half_arr_offset28+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp + half_arr_offset28+40)->d0 * (tmp + half_arr_offset28+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp + half_arr_offset28+40)->d1 * (tmp + half_arr_offset28+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp + half_arr_offset28+10)->d0 * (tmp + half_arr_offset28+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp + half_arr_offset28+10)->d1 * (tmp + half_arr_offset28+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp + half_arr_offset28+10)->d0 * (tmp + half_arr_offset28+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp + half_arr_offset28+10)->d1 * (tmp + half_arr_offset28+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1699,8 +1699,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp + half_arr_offset28)->d0 * (tmp + half_arr_offset28+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp + half_arr_offset28)->d1 * (tmp + half_arr_offset28+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp + half_arr_offset28)->d0 * (tmp + half_arr_offset28+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp + half_arr_offset28)->d1 * (tmp + half_arr_offset28+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1860,7 +1860,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy28_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy28_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1870,7 +1870,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2422,8 +2422,8 @@ void radix28_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2510,29 +2510,29 @@ void radix28_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes */
 	  #endif
 
-		ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
 	  #if defined(USE_AVX2) || defined(USE_ARM_V8_SIMD)
 		// AVX2 (i.e. FMA)means non-Nussbaumer radix-7, uses these sincos constants:
-		ASSERT(HERE, (ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!");
+		ASSERT((ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!");
 	  #else
 		/* SSE2 version assumes LO_ADD = 0, i.e. the low-mul Nussbaumer-style DFT implementation: */
-		ASSERT(HERE, (ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!");
+		ASSERT((ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!");
 	  #endif
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 	{
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix320_ditN_cy_dif1.c b/src/radix320_ditN_cy_dif1.c
index 9da345b1..fdb8248f 100755
--- a/src/radix320_ditN_cy_dif1.c
+++ b/src/radix320_ditN_cy_dif1.c
@@ -324,7 +324,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -392,11 +392,11 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -434,7 +434,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -444,7 +444,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -484,24 +484,24 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix320_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix320_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix320_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -533,7 +533,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0x(508 + a0) = 0x5b0; This is where the value of half_arr_offset320 comes from
 		half_arr= tmp + 0x02;
 	  #endif
-		ASSERT(HERE, (radix320_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix320_creals_in_local_store checksum failed!");
+		ASSERT((radix320_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix320_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );
 		VEC_DBL_INIT(ycc1, cc1  );	// radix-5 DFT trig consts
@@ -1203,12 +1203,12 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1232,7 +1232,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1385,8 +1385,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1396,8 +1396,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1406,26 +1406,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1555,7 +1555,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy320_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy320_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1565,7 +1565,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2678,8 +2678,8 @@ void radix320_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2906,21 +2906,21 @@ void radix320_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix32_dif_dit_pass.c b/src/radix32_dif_dit_pass.c
index 4b9167f4..5b53e809 100755
--- a/src/radix32_dif_dit_pass.c
+++ b/src/radix32_dif_dit_pass.c
@@ -99,15 +99,15 @@ void radix32_dif_pass(double a[], int n, struct complex rt0[], struct complex rt
 	{
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage
 			free((void *)sc_arr);	sc_arr=0x0;
 		}
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 64 16-byte slots of sc_arr for temporaries, next 7 for the nontrivial complex 32nd roots,
 	last 64 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array.
@@ -169,7 +169,7 @@ void radix32_dif_pass(double a[], int n, struct complex rt0[], struct complex rt
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		r00 = __r0 + thr_id*0x90;
 		cc0	= r00 + 0x41;
 	#endif
@@ -197,8 +197,8 @@ void radix32_dif_pass(double a[], int n, struct complex rt0[], struct complex rt
 	p14 = p14 + ( (p14 >> DAT_BITS) << PAD_BITS );
 	p18 = p18 + ( (p18 >> DAT_BITS) << PAD_BITS );
 	p1C = p1C + ( (p1C >> DAT_BITS) << PAD_BITS );
-	ASSERT(HERE, p04+p04 == p08, "p04+p04 != p08");
-	ASSERT(HERE, p04+p08 == p0C, "p04+p08 != p0C");
+	ASSERT(p04+p04 == p08, "p04+p04 != p08");
+	ASSERT(p04+p08 == p0C, "p04+p08 != p0C");
 
 /*...The radix-32 pass is here.	*/
 
@@ -1477,15 +1477,15 @@ void radix32_dit_pass(double a[], int n, struct complex rt0[], struct complex rt
 	{
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage
 			free((void *)sc_arr);	sc_arr=0x0;
 		}
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 64 16-byte slots of sc_arr for temporaries, next 7 for the nontrivial complex 32nd roots,
 	last 64 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array.
@@ -1545,7 +1545,7 @@ void radix32_dit_pass(double a[], int n, struct complex rt0[], struct complex rt
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
   #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 	r00 = __r0 + thr_id*0x90;
 	isrt2 = r00 + 0x40;
 	cc0	= isrt2 + 1;
diff --git a/src/radix32_ditN_cy_dif1.c b/src/radix32_ditN_cy_dif1.c
index 4728a9e8..ec34bf81 100755
--- a/src/radix32_ditN_cy_dif1.c
+++ b/src/radix32_ditN_cy_dif1.c
@@ -324,11 +324,11 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
 	//	printf("0: wt*inv-1 = %15.8e\n",fabs(wts_mult[0]*inv_mult[0] - 1.));
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 	//	printf("1: wt*inv-1 = %15.8e\n",fabs(wts_mult[1]*inv_mult[1] - 1.));
 
 	#ifdef MULTITHREAD
@@ -368,7 +368,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -379,7 +379,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 				main_work_units = 0;
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 		}
@@ -426,23 +426,23 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread
 		cslots_in_local_store = radix32_creals_in_local_store + (20+RADIX/2)/2;	// Just add enough int64 space for both cases, plus some
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix32_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 64 vec_ddl-sized slots of sc_arr for temporaries, next 7 for the nontrivial complex 16th roots,
 	next 32 for the vector carries, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -489,8 +489,8 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-//		ASSERT(HERE, half_arr_offset32 == (uint32)(half_arr-sc_ptr), "half_arr_offset32 mismatches actual!");
-		ASSERT(HERE, (radix32_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix32_creals_in_local_store checksum failed!");
+//		ASSERT(half_arr_offset32 == (uint32)(half_arr-sc_ptr), "half_arr_offset32 mismatches actual!");
+		ASSERT((radix32_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix32_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );		VEC_DBL_INIT(one, 1.0  );
@@ -632,7 +632,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					// Up-multiply the complex exponential:
 					qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 					qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
-					printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+					printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				}
 				exit(0);
 			#endif
@@ -999,14 +999,14 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/radix-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < NDIVR/CY_THREADS; j++)
@@ -1026,7 +1026,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 			if(CY_THREADS > 1)
 			{
 				for(ithread = 1; ithread < CY_THREADS; ithread++)
@@ -1205,8 +1205,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1216,8 +1216,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1228,20 +1228,20 @@ for(outer=0; outer <= 1; outer++)
 		// Dec 2015: fast-GCD usage of this routine may involve multiple 'main' arrays
 		// on successive calls, so set here at runtime rather than in init-only block:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
-		ASSERT(HERE, ((tmp + 0x40)->d0 == 2.0 && (tmp + 0x40)->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT(((tmp + 0x40)->d0 == 2.0 && (tmp + 0x40)->d1 == 2.0), "thread-local memcheck failed!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1251,11 +1251,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1268,8 +1268,8 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_SSE2)
 			// This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots:
-			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1492,7 +1492,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy32_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy32_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1502,7 +1502,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix32_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1578,7 +1578,7 @@ for(outer=0; outer <= 1; outer++)
 			// Must use NDIVR instead of p1 here since p1 may have pads which are not applied to element-2-slots-before
 			j1 = NDIVR-2;	j1 += ( (j1 >> DAT_BITS) << PAD_BITS );
 			j2 = j1+RE_IM_STRIDE;
-			ASSERT(HERE, t[RADIX-1].re <= 1.0 && t[RADIX-1].im <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!");
+			ASSERT(t[RADIX-1].re <= 1.0 && t[RADIX-1].im <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!");
 			// Undo the initial dif pass just for the 16 complex terms in question:
 			RADIX_32_DIT(\
 				a+j1,arr_offsets,RE_IM_STRIDE,\
@@ -1595,11 +1595,11 @@ for(outer=0; outer <= 1; outer++)
 			// Verify that any cyout = 1 has the corresponding high word < 0,
 			// then absorb cyout back into the high word and zero the carry:
 			if(t[RADIX-1].re == 1.0) {
-				ASSERT(HERE, a[j1+p1C+p03] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!");
+				ASSERT(a[j1+p1C+p03] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!");
 				a[j1+p1C+p03] += FFT_MUL_BASE;	t[RADIX-1].re = 0.0;
 			}
 			if(t[RADIX-1].im == 1.0) {
-				ASSERT(HERE, a[j2+p1C+p03] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!");
+				ASSERT(a[j2+p1C+p03] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!");
 				a[j2+p1C+p03] += FFT_MUL_BASE;	t[RADIX-1].im = 0.0;
 			}
 			// Redo the initial dif pass just for the 16 complex terms in question:
@@ -1940,8 +1940,8 @@ void radix32_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2028,10 +2028,10 @@ void radix32_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2041,18 +2041,18 @@ void radix32_dit_pass1(double a[], int n)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#else	// SSE2:
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		} else {
 		#ifdef USE_AVX512
 			/* No-Op */
 		#else
-			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		}
 
diff --git a/src/radix32_dyadic_square.c b/src/radix32_dyadic_square.c
index 50c615a6..09338f5a 100755
--- a/src/radix32_dyadic_square.c
+++ b/src/radix32_dyadic_square.c
@@ -166,7 +166,7 @@ void radix32_dyadic_square(
 		b = (double *)(fwd_fft_only & ~0xCull);
 		// BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0:
 		if(fwd_fft_only & 0xC) {
-			ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
+			ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
 			fwd_fft_only = 3ull;
 		}
 	}
@@ -184,9 +184,9 @@ void radix32_dyadic_square(
 /**************************************************************************************************************************************/
 	if((rad0save != radix0) || (nsave != n))
 	{
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		nsave = n;
-		ASSERT(HERE, N2 == n/2, "N2 bad!");
+		ASSERT(N2 == n/2, "N2 bad!");
 		rad0save = radix0;
 		ndivrad0 = n/radix0;	ndivrad0m1 = ndivrad0-1;	// ndivrad0 always a power of 2, so can do a fast-mod via & (ndivrad0-1)
 		for(j = 0; j < ndivrad0; j += stride)
@@ -194,7 +194,7 @@ void radix32_dyadic_square(
 			j1 = j + ( (j >> DAT_BITS) << PAD_BITS );
 			if( (j1+stridh) != (j+stridh) + ( ((j+stridh) >> DAT_BITS) << PAD_BITS ) ) {
 				printf("j, j1, stride/2 = %d,%d,%d, jpad = %d\n",j,j1, stridh, (j+stridh) + (((j+stridh) >> DAT_BITS) << PAD_BITS) );
-				ASSERT(HERE, 0 , "add1 calculation violates padded index rules!");
+				ASSERT(0 , "add1 calculation violates padded index rules!");
 			}
 		}
 		// Nov 2017: For the non-synthetic final-pass radices (16 and 32) the default contiguous-data chunksize
@@ -214,9 +214,9 @@ void radix32_dyadic_square(
 		!   Allocate and initialize an index array containing N/32 indices...
 
 		index_ptmp = ALLOC_INT(N2/32);
-		if(!index_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array INDEX in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array INDEX in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		index = ALIGN_INT(index_ptmp);
-		if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		for(i=0; i < N2/32; i++)
 		{
 			index[i]=i;
@@ -226,11 +226,11 @@ void radix32_dyadic_square(
 		index1_mod = (n>>6)/radix0;	/* complex length requires an additional divide by 2 */
 
 		index_ptmp0 = ALLOC_INT(index_ptmp0, index0_mod);
-		if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		index0 = ALIGN_INT(index_ptmp0);
 
 		index_ptmp1 = ALLOC_INT(index_ptmp1, index1_mod);
-		if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		index1 = ALIGN_INT(index_ptmp1);
 
 		for(i=0; i < index0_mod; i++){index0[i]=       i;}
@@ -248,7 +248,7 @@ void radix32_dyadic_square(
 			if(i == radix0)
 				break;
 		}
-		if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 		bit_reverse_int(index0, index0_mod,                 nradices_prim_radix0, &radix_prim[nradices_prim_radix0-1], -1, (int *)arr_scratch);
 		bit_reverse_int(index1, index1_mod, nradices_prim-5-nradices_prim_radix0, &radix_prim[nradices_prim       -6], -1, (int *)arr_scratch);
@@ -263,10 +263,10 @@ void radix32_dyadic_square(
 		if(init_sse2 <= max_threads)	// current alloc sufficient
 			return;
 
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
 
 	#ifdef USE_SSE2
@@ -276,14 +276,14 @@ void radix32_dyadic_square(
 		}
 		// Index vectors used in SIMD roots-computation.
 		// Nov 2017: Add pair of int-slots per thread here ----vv, to support synthesized final-pass radices >= 256.
-	//	sm_arr = ALLOC_INT(sm_arr, max_threads*(14*RE_IM_STRIDE+2) + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		sm_arr = ALLOC_INT(sm_arr, max_threads* 14*RE_IM_STRIDE    + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+	//	sm_arr = ALLOC_INT(sm_arr, max_threads*(14*RE_IM_STRIDE+2) + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		sm_arr = ALLOC_INT(sm_arr, max_threads* 14*RE_IM_STRIDE    + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sm_ptr = ALIGN_INT(sm_arr);
-		ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 		// Twiddles-array:
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x94*max_threads + 100);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x94*max_threads + 100);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 64 vec_dbl slots of sc_arr for temporaries, next 8 for scratch, next 7 for the nontrivial complex 16th roots,
 	next 62 for the doubled sincos twiddles, next 4 for [1.0,2.0,{0.25, unused in fermat-mod mode},sqrt2] and at least 3 more to allow for 64-byte alignment of the array.
@@ -381,7 +381,7 @@ void radix32_dyadic_square(
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
   #ifdef USE_SSE2
 	k1_arr =   __i0 + thr_id*14*RE_IM_STRIDE;
 	k2_arr = k1_arr + 7*RE_IM_STRIDE;
@@ -432,9 +432,9 @@ void radix32_dyadic_square(
 #endif
 
 	/*...If a new runlength, should not get to this point: */
-	ASSERT(HERE, n == nsave,"n != nsave");
-	ASSERT(HERE, incr == 64,"incr == 64");
-//	ASSERT(HERE, ndivrad0 == n/radix0,"bad value for ndivrad0!");	Synthesized final-pass radices break this
+	ASSERT(n == nsave,"n != nsave");
+	ASSERT(incr == 64,"incr == 64");
+//	ASSERT(ndivrad0 == n/radix0,"bad value for ndivrad0!");	Synthesized final-pass radices break this
 	/*
 	k = ii*(ndivrad0 >> 6);
 	*/
@@ -1039,8 +1039,8 @@ printf("c[%2d] = %18.15f,%18.15f,%18.15f,%18.15f,%18.15f,%18.15f,%18.15f,%18.15f
 
 		add0 = &a[j1];
 		add1 = &a[j1+stridh];
-	//	printf("stride = %d, add0,1 = %llX, %llX, diff = %llX\n",stride,(int64)add0,(int64)add1, (int64)add1-(int64)add0);	exit(0);
-	//	ASSERT(HERE, (j1+stride) == (j+stride) + ( ((j+stride) >> DAT_BITS) << PAD_BITS ) , "add1 calculation violates padded index rules!");
+	//	printf("stride = %d, add0,1 = %" PRIX64 ", %" PRIX64 ", diff = %" PRIX64 "\n",stride,(int64)add0,(int64)add1, (int64)add1-(int64)add0);	exit(0);
+	//	ASSERT((j1+stride) == (j+stride) + ( ((j+stride) >> DAT_BITS) << PAD_BITS ) , "add1 calculation violates padded index rules!");
 	  #ifdef USE_AVX512	// The generic pre-dyadic-square macro needs 8 main-array addresses in AVX mode
 	  					// because (add[1,3,5,7]-add[0,2,4,6]) have opposite signs for Fermat and Mersenne-mod:
 		add1 = add0 +  64;
diff --git a/src/radix32_wrapper_ini.c b/src/radix32_wrapper_ini.c
index 59094d2c..fa40b1dd 100755
--- a/src/radix32_wrapper_ini.c
+++ b/src/radix32_wrapper_ini.c
@@ -75,7 +75,7 @@ void radix32_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int r
 				ws_m           [iblock_next] = m           ;
 				ws_blocklen    [iblock_next] = blocklen    ;
 				ws_blocklen_sum[iblock_next] = blocklen_sum;
-			//	printf("%8llu  %20llu  %8llu: init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
+			//	printf("%8" PRIu64 "  %20" PRIu64 "  %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k);
 				return;
 			}
 	jump_in:	// Entry point for all blocks but the first.
diff --git a/src/radix32_wrapper_square.c b/src/radix32_wrapper_square.c
index 06732110..88687a5a 100755
--- a/src/radix32_wrapper_square.c
+++ b/src/radix32_wrapper_square.c
@@ -216,7 +216,7 @@ void radix32_wrapper_square(
 		b = (double *)(fwd_fft_only & ~0xCull);
 		// BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0:
 		if(fwd_fft_only & 0xC) {
-			ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
+			ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!");	// Otherwise bits 2:3 should've been zeroed prior to entry
 			fwd_fft_only = 3ull;
 		}
 	}
@@ -235,10 +235,10 @@ void radix32_wrapper_square(
 		nsave = n;
 		if(init_sse2 > max_threads)	// current SIMD local-alloc insufficient
 		{
-			ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+			ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			max_threads = init_sse2;
 		#ifndef COMPILER_TYPE_GCC
-			ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+			ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 		#endif
 		//	printf("%Ns: max_threads = %d, NTHREADS = %d\n",func, max_threads, NTHREADS);
 
@@ -250,14 +250,14 @@ void radix32_wrapper_square(
 		// Index vectors used in SIMD roots-computation.
 		// The AVX512 compute-sincos-mults code needs 2 elements per complex-double-load, so use 14*RE_IM_STRIDE per array
 		// to alloc storage here for all cases, even though that leaves upper array halves unused for sub-AVX512.
-		sm_arr = ALLOC_INT(sm_arr, max_threads*28*RE_IM_STRIDE + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sm_arr = ALLOC_INT(sm_arr, max_threads*28*RE_IM_STRIDE + 16);	if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sm_ptr = ALIGN_INT(sm_arr);
-		ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 		// Twiddles-array: Need 0x92 slots for data, plus need to leave room to pad-align.
 		// v20: To support inline a*(b-c) for p-1 stage 2, need 2*RADIX = 64 added vec_dbl, thus 0x98 ==> 0xd8:
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 0xd8*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 0xd8*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 		/* Use low 64 vec_dbl slots of sc_arr for temporaries, next 8 for scratch, next 7 for the nontrivial complex 16th roots,
 		next 62 for the doubled sincos twiddles, next 4 for [1.0,2.0,0.25,sqrt2] and at least 3 more to allow for 64-byte alignment of the array.
@@ -362,7 +362,7 @@ void radix32_wrapper_square(
 		!   for the itmp space and that sent to the bit_reverse_int for scratch space
 		!   don't overlap:
 		*/
-		ASSERT(HERE, N2 == n/2, "N2 bad!");
+		ASSERT(N2 == n/2, "N2 bad!");
 		itmp = (int *)&arr_scratch[N2/32];	/* Conservatively assume an int might be as long as 8 bytes here */
 		for(i=0; i < N2/32; i++)
 		{
@@ -452,7 +452,7 @@ void radix32_wrapper_square(
 			free((void *)index_ptmp);	index_ptmp=0x0;
 		}
 		index_ptmp = ALLOC_INT(index_ptmp, N2/32);
-		ASSERT(HERE, index_ptmp != 0,"ERROR: unable to allocate array INDEX!");
+		ASSERT(index_ptmp != 0,"ERROR: unable to allocate array INDEX!");
 		index      = ALIGN_INT(index_ptmp);
 	/*
 	!...Now rearrange FFT sincos indices using the main loop structure as a template.
@@ -491,7 +491,7 @@ void radix32_wrapper_square(
 		  if(j2_start == n-64)break;
 
 		  blocklen_sum = blocklen_sum + blocklen;
-		  ASSERT(HERE, i != 0,"ERROR 10!");
+		  ASSERT(i != 0,"ERROR 10!");
 		  blocklen = (radix_prim[i-1]-1)*blocklen_sum;
 
 		  j2_start = j2_start+(blocklen<<2);
@@ -511,7 +511,7 @@ void radix32_wrapper_square(
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
   #ifdef USE_SSE2
 	k1_arr = __i0 + thr_id*28*RE_IM_STRIDE;
 	k2_arr = k1_arr +      14*RE_IM_STRIDE;
@@ -559,7 +559,7 @@ void radix32_wrapper_square(
   #endif
 #endif
 	/*...If a new runlength, should not get to this point: */
-	ASSERT(HERE, n == nsave,"n != nsave");
+	ASSERT(n == nsave,"n != nsave");
 
 /*
 !   SOLVING THE CACHE FLOW PROBLEM FOR BIT-REVERSED ARRAY DATA:
diff --git a/src/radix352_ditN_cy_dif1.c b/src/radix352_ditN_cy_dif1.c
index 7ca5e726..1a0a4085 100755
--- a/src/radix352_ditN_cy_dif1.c
+++ b/src/radix352_ditN_cy_dif1.c
@@ -340,7 +340,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -408,11 +408,11 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -450,7 +450,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -460,7 +460,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -500,24 +500,24 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix352_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix352_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix352_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -588,7 +588,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfadd(qt,qtheta);	cq4 = qfcos(qt);	sq4 = qfsin(qt);
 		//================================================================
 	  #endif
-		ASSERT(HERE, (radix352_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix352_creals_in_local_store checksum failed!");
+		ASSERT((radix352_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix352_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
 	  #if 1
@@ -1425,12 +1425,12 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1454,7 +1454,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1607,8 +1607,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1618,8 +1618,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1628,26 +1628,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1660,7 +1660,7 @@ for(outer=0; outer <= 1; outer++)
 /******************* AVX debug stuff: *******************/
 #if 0
 	int ipad;
-	ASSERT(HERE, p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!");
+	ASSERT(p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!");
 	// Use RNG to populate data array:
 	rng_isaac_init(TRUE);
 	double dtmp = 1024.0*1024.0*1024.0*1024.0;
@@ -1935,7 +1935,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy352_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy352_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1945,7 +1945,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2901,8 +2901,8 @@ void radix352_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -3432,21 +3432,21 @@ void radix352_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix36_ditN_cy_dif1.c b/src/radix36_ditN_cy_dif1.c
index f986e838..3f6b75e4 100755
--- a/src/radix36_ditN_cy_dif1.c
+++ b/src/radix36_ditN_cy_dif1.c
@@ -294,7 +294,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -356,11 +356,11 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -399,7 +399,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -409,7 +409,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -449,24 +449,24 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix36_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix36_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 192 16-byte slots of sc_arr for r-and-s temporaries, next 7 for the nontrivial complex 16th roots,
 	next 36 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the   table lookup stuff,
@@ -537,7 +537,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 2 = 0xa5 [avx] or 0xae [sse2]; This is where the value of half_arr_offset36 comes from
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes */
 
-		ASSERT(HERE, (radix36_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix36_creals_in_local_store checksum failed!");
+		ASSERT((radix36_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix36_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0 );		VEC_DBL_INIT(one, 1.0 );
@@ -880,12 +880,12 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -909,7 +909,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1061,8 +1061,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1072,8 +1072,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1082,26 +1082,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1114,7 +1114,7 @@ for(outer=0; outer <= 1; outer++)
 /******************* AVX debug stuff: *******************/
 #if 0
 	int ipad;
-	ASSERT(HERE, p01 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!");
+	ASSERT(p01 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!");
 	// Use RNG to populate data array:
 	rng_isaac_init(TRUE);
 	double dtmp = 128.0*1024.0*1024.0*1024.0*1024.0;	// 2^47
@@ -1389,7 +1389,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy36_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy36_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1399,7 +1399,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix40_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1855,8 +1855,8 @@ void radix36_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -1953,21 +1953,21 @@ void radix36_dit_pass1(double a[], int n)
 		sse2_rnd= tmp + 0x01;
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes */
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix384_ditN_cy_dif1.c b/src/radix384_ditN_cy_dif1.c
index 5acdadb9..a87b6dff 100755
--- a/src/radix384_ditN_cy_dif1.c
+++ b/src/radix384_ditN_cy_dif1.c
@@ -210,7 +210,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 	// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
-	double *addr, *addi;
+	const double *addr, *addi;
 	struct complex t[RADIX], *tptr;
 	int err;
 	static int first_entry=TRUE;
@@ -310,7 +310,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array:
@@ -376,11 +376,11 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -418,7 +418,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -428,7 +428,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -468,24 +468,24 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix384_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix384_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix384_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -548,8 +548,8 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
-//		ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix384_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix384_creals_in_local_store checksum failed!");
+//		ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix384_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix384_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
@@ -1102,12 +1102,12 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1131,7 +1131,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1283,8 +1283,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1294,8 +1294,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1304,26 +1304,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1336,7 +1336,7 @@ for(outer=0; outer <= 1; outer++)
 /******************* AVX debug stuff: *******************/
 #if 0
 	int ipad;
-	ASSERT(HERE, p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!");
+	ASSERT(p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!");
 	// Use RNG to populate data array:
 	rng_isaac_init(TRUE);
 	double dtmp = 1024.0*1024.0*1024.0*1024.0;
@@ -1608,7 +1608,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy384_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy384_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1618,7 +1618,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2410,8 +2410,8 @@ void radix384_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2690,21 +2690,21 @@ void radix384_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r000 == thread_arg->r000), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r000 == thread_arg->r000), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix384_main_carry_loop.h b/src/radix384_main_carry_loop.h
index 37ea6bad..b835a670 100755
--- a/src/radix384_main_carry_loop.h
+++ b/src/radix384_main_carry_loop.h
@@ -229,8 +229,8 @@ if(tid == 1) {
 		// In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass:
 		if(target_idx == j) {
 		#ifdef USE_SSE2
-			addr = (double *)s1p000 + target_set;
-			*addr += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
+			double *addr_ = (double *)s1p000 + target_set;
+			*addr_ += target_cy*(n>>1);	// target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor
 		#else
 			// target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum:
 			l = target_set&1;	target_set >>= 1;
@@ -529,26 +529,28 @@ if(tid == 1) {
 	  if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) {	// LOACC with tunable DWT-weights chaining
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy;
 		for(l1 = 0; l1 < RADIX>>2; l1++) {
 			jt = j1 + poff[l1]; jp = j2 + poff[l1];	// poff[] = p04,08,...
 			// Re-init weights every 4th macro invocation to keep errors under control:
-			cmplx_carry_norm_errcheck0(a[jt   ],a[jp   ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_fast_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_errcheck0(a[jt   ],a[jp   ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_fast_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  } else {	// HiACC:
 
 		/*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */
-		l = 0; addr = cy; itmp = bjmodn;
+		l = 0; itmp = bjmodn;
+		double *addr_ = cy;
 		for(l1 = 0; l1 < RADIX>>2; l1++) {
 			jt = j1 + poff[l1]; jp = j2 + poff[l1];	// poff[] = p04,08,...
-			cmplx_carry_norm_errcheck0(a[jt   ],a[jp   ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
-			cmplx_carry_norm_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp;
+			cmplx_carry_norm_errcheck0(a[jt   ],a[jp   ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
+			cmplx_carry_norm_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp;
 		}
 
 	  }	// LOACC or HIACC?
diff --git a/src/radix4032_ditN_cy_dif1.c b/src/radix4032_ditN_cy_dif1.c
index 7be1ae8a..538d93d0 100755
--- a/src/radix4032_ditN_cy_dif1.c
+++ b/src/radix4032_ditN_cy_dif1.c
@@ -388,11 +388,11 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -431,7 +431,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -441,7 +441,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -484,24 +484,24 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix4032_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix4032_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -529,13 +529,13 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, half_arr_offset4032 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT(half_arr_offset4032 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) {
 			j = (1<<(2*(L2_SZ_VD-2))) + 4;	// 16+4 for sse2, 64+4 for avx
 		} else {
 			j = ODD_RADIX<<2;				// 4*ODD_RADIX
 		}
-		ASSERT(HERE, (radix4032_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix4032_creals_in_local_store checksum failed!");
+		ASSERT((radix4032_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix4032_creals_in_local_store checksum failed!");
 
 		/* SSE2 math = 53-mantissa-bit IEEE double-float: */
 		VEC_DBL_INIT(sse2_rnd, crnd);
@@ -604,7 +604,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -979,12 +979,12 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1019,7 +1019,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1070,7 +1070,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
 					break;
 				};
 			}	//	printf("wts_idx_incr = %u\n",wts_idx_incr);
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1504,8 +1504,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1515,8 +1515,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1525,26 +1525,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 		#ifdef USE_AVX
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1554,8 +1554,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1776,7 +1776,7 @@ for(outer=0; outer <= 1; outer++)
 	/*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/
 	for(j = 0; j < main_work_units; ++j)
 	{
-		ASSERT(HERE, 0x0 == cy4032_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy4032_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1786,7 +1786,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2885,8 +2885,8 @@ void radix4032_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2945,23 +2945,23 @@ void radix4032_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 	{
 	  #ifdef USE_AVX
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix4032_main_carry_loop.h b/src/radix4032_main_carry_loop.h
index 29ddc950..dcddb954 100755
--- a/src/radix4032_main_carry_loop.h
+++ b/src/radix4032_main_carry_loop.h
@@ -181,7 +181,7 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 						// (and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1).
 			// *But*: since the init macro does an on-the-fly version of this between j,j+2 portions, external code co2=co3 must come *after* both ctmp-data octets are inited.
 		  #ifdef USE_AVX512
-			ASSERT(HERE, 0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!");
+			ASSERT(0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!");
 		  #endif
 			AVX_cmplx_carry_fast_wtsinit_X8(add1,add2,add3, itmp, half_arr,sign_mask, n_minus_sil,n_minus_silp1,sinwt,sinwtm1, sse_bw,sse_n)
 
diff --git a/src/radix40_ditN_cy_dif1.c b/src/radix40_ditN_cy_dif1.c
index 8af575b9..ed0473f0 100755
--- a/src/radix40_ditN_cy_dif1.c
+++ b/src/radix40_ditN_cy_dif1.c
@@ -278,7 +278,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	}
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 
   #ifdef USE_IMCI512
 //	WARN(HERE, "radix40_ditN_cy_dif1: No k1om / IMCI-512 support; Skipping this leading radix.", "", 1); return(ERR_RADIX0_UNAVAILABLE);
@@ -343,11 +343,11 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -386,7 +386,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -396,7 +396,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -436,24 +436,24 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix40_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix40_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 80 16-byte slots of sc_arr for temporaries, next 5 for the nontrivial complex 16th roots,
 	next 80 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -527,7 +527,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 180 [AVX] or 190 [SSE2]; This is where the value of half_arr_offset40 comes from
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes */
 
-		ASSERT(HERE, (radix40_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix40_creals_in_local_store checksum failed!");
+		ASSERT((radix40_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix40_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
@@ -877,12 +877,12 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -906,7 +906,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1058,8 +1058,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1069,8 +1069,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1079,26 +1079,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1231,7 +1231,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy40_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy40_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1241,7 +1241,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix40_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1665,8 +1665,8 @@ void radix40_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -1769,21 +1769,21 @@ void radix40_dit_pass1(double a[], int n)
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 180 [AVX] or 190 [SSE2]; This is where the value of half_arr_offset40 comes from
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes */
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix44_ditN_cy_dif1.c b/src/radix44_ditN_cy_dif1.c
index 0c61a220..7cb1fbb6 100755
--- a/src/radix44_ditN_cy_dif1.c
+++ b/src/radix44_ditN_cy_dif1.c
@@ -325,7 +325,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -387,11 +387,11 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 	  }
 
 	#ifdef MULTITHREAD
@@ -431,7 +431,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -441,7 +441,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -481,24 +481,24 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix44_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix44_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix44_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x1f) == 0, "sm_ptr not 32-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x1f) == 0, "sm_ptr not 32-byte aligned!");
 
 	/* Use low 88x2 16-byte slots of sc_arr for temporaries, next 21 for the constants needed by the radix-11 DFT,
 	next RADIX/2 = 22 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -552,7 +552,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (radix44_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix44_creals_in_local_store checksum failed!");
+		ASSERT((radix44_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix44_creals_in_local_store checksum failed!");
 	  #if (defined(USE_AVX2) && DFT_11_FMA) || defined(USE_ARM_V8_SIMD)
 	  	/* no-op */
 	  #else
@@ -639,35 +639,35 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	   #if 0
 		//================================================================
 		// *** Cosine terms: ***
-		qt = qfsub(qfadd(cq0,cq2),qfadd(cq3,cq4));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a0, "a0");	/* a0 = (   cq0      -  cq3+  cq2-  cq4) */
-		qt = qfsub(qfadd(cq1,cq2),qfadd(cq3,cq4));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a1, "a1");	/* a1 = (         cq1-  cq3+  cq2-  cq4) */
-		qt = qfsub(qfadd(cq1,cq2),qfadd(cq0,cq3));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a3, "a3");	/* a3 = (-  cq0+  cq1-  cq3+  cq2      ) */
-		qt = qfsub(qfadd(cq1,cq4),qfadd(cq0,cq3));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a4, "a4");	/* a4 = (-  cq0+  cq1-  cq3      +  cq4) */
-		qt = qfsub(cq2,cq3);						dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a6, "a6");	/* a6 = (            -  cq3+  cq2      ) */
-		qt = qfsub(cq1,cq3);						dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a7, "a7");	/* a7 = (         cq1-  cq3            ) */
-		qt = qfmul(qfifth, qfsub( qfmul_pow2(cq3,2), qfadd(qfadd(cq0,cq1),qfadd(cq2,cq4)) ));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a8, "a8");	/* a8 = (-  cq0-  cq1+4*cq3-  cq2-  cq4)/5 */
-		qt = qfsub( qfmul(qfifth, qfadd( cq0 , qfadd(qfadd(cq1,cq2),qfadd(cq3,cq4)) )), QONE);	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a9, "a9");	/* a9 = (   cq0+  cq1+  cq3+  cq2+  cq4)/5 - 1 */
+		qt = qfsub(qfadd(cq0,cq2),qfadd(cq3,cq4));	dtmp = qfdbl(qt);	ASSERT(dtmp == a0, "a0");	/* a0 = (   cq0      -  cq3+  cq2-  cq4) */
+		qt = qfsub(qfadd(cq1,cq2),qfadd(cq3,cq4));	dtmp = qfdbl(qt);	ASSERT(dtmp == a1, "a1");	/* a1 = (         cq1-  cq3+  cq2-  cq4) */
+		qt = qfsub(qfadd(cq1,cq2),qfadd(cq0,cq3));	dtmp = qfdbl(qt);	ASSERT(dtmp == a3, "a3");	/* a3 = (-  cq0+  cq1-  cq3+  cq2      ) */
+		qt = qfsub(qfadd(cq1,cq4),qfadd(cq0,cq3));	dtmp = qfdbl(qt);	ASSERT(dtmp == a4, "a4");	/* a4 = (-  cq0+  cq1-  cq3      +  cq4) */
+		qt = qfsub(cq2,cq3);						dtmp = qfdbl(qt);	ASSERT(dtmp == a6, "a6");	/* a6 = (            -  cq3+  cq2      ) */
+		qt = qfsub(cq1,cq3);						dtmp = qfdbl(qt);	ASSERT(dtmp == a7, "a7");	/* a7 = (         cq1-  cq3            ) */
+		qt = qfmul(qfifth, qfsub( qfmul_pow2(cq3,2), qfadd(qfadd(cq0,cq1),qfadd(cq2,cq4)) ));	dtmp = qfdbl(qt);	ASSERT(dtmp == a8, "a8");	/* a8 = (-  cq0-  cq1+4*cq3-  cq2-  cq4)/5 */
+		qt = qfsub( qfmul(qfifth, qfadd( cq0 , qfadd(qfadd(cq1,cq2),qfadd(cq3,cq4)) )), QONE);	dtmp = qfdbl(qt);	ASSERT(dtmp == a9, "a9");	/* a9 = (   cq0+  cq1+  cq3+  cq2+  cq4)/5 - 1 */
 		qs = qfadd(qfadd(cq0,cq1), cq2);	qs = qfmul_pow2(qs,1);	// 2*(cq0+cq1+cq2)
 		qt = qfadd(cq3,cq4);	qt = qfadd(qt, qfmul_pow2(qt,1));	// 3*(cq3+cq4)
-		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a2, "a2");	/* a2 = (-2*cq0-2*cq1+3*cq3-2*cq2+3*cq4)/5 */
+		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(dtmp == a2, "a2");	/* a2 = (-2*cq0-2*cq1+3*cq3-2*cq2+3*cq4)/5 */
 		qs = qfadd(qfadd(cq4,cq1), cq2);	qs = qfmul_pow2(qs,1);	// 2*(cq4+cq1+cq2)
 		qt = qfadd(cq3,cq0);	qt = qfadd(qt, qfmul_pow2(qt,1));	// 3*(cq3+cq0)
-		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == a5, "a5");	/* a5 = ( 3*cq0-2*cq1+3*cq3-2*cq2-2*cq4)/5 */
+		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(dtmp == a5, "a5");	/* a5 = ( 3*cq0-2*cq1+3*cq3-2*cq2-2*cq4)/5 */
 		// *** Sine terms: ***
-		qt = qfsub(qfadd(sq0,sq2),qfadd(sq3,sq4));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b0, "b0");	/* b0 = (   sq0      -  sq3+  sq2-  sq4) */
-		qt = qfsub(qfsub(sq2,sq1),qfadd(sq3,sq4));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b1, "b1");	/* b1 = (        -sq1-  sq3+  sq2-  sq4) */
-		qt = qfsub(qfsub(sq2,sq1),qfadd(sq0,sq3));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b3, "b3");	/* b3 = (-  sq0-  sq1-  sq3+  sq2      ) */
-		qt = qfsub(qfsub(sq4,sq1),qfadd(sq0,sq3));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b4, "b4");	/* b4 = (-  sq0-  sq1-  sq3      +  sq4) */
-		qt = qfsub(sq2,sq3);						dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b6, "b6");	/* b6 = (            -  sq3+  sq2      ) */
-		qt = qfneg(qfadd(sq1,sq3));					dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b7, "b7");	/* b7 = (        -sq1-  sq3            ) */
-		qt = qfmul(qfifth, qfsub( qfmul_pow2(sq3,2), qfadd(qfsub(sq0,sq1),qfadd(sq2,sq4)) ));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b8, "b8");	/* b8 = (-  sq0+  sq1+4*sq3-  sq2-  sq4)/5 */
-		qt = qfmul(qfifth, qfadd( sq0 , qfadd(qfsub(sq2,sq1),qfadd(sq3,sq4)) ));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b9, "b9");	/* b9 = (   sq0-  sq1+  sq3+  sq2+  sq4)/5 - 1 */
+		qt = qfsub(qfadd(sq0,sq2),qfadd(sq3,sq4));	dtmp = qfdbl(qt);	ASSERT(dtmp == b0, "b0");	/* b0 = (   sq0      -  sq3+  sq2-  sq4) */
+		qt = qfsub(qfsub(sq2,sq1),qfadd(sq3,sq4));	dtmp = qfdbl(qt);	ASSERT(dtmp == b1, "b1");	/* b1 = (        -sq1-  sq3+  sq2-  sq4) */
+		qt = qfsub(qfsub(sq2,sq1),qfadd(sq0,sq3));	dtmp = qfdbl(qt);	ASSERT(dtmp == b3, "b3");	/* b3 = (-  sq0-  sq1-  sq3+  sq2      ) */
+		qt = qfsub(qfsub(sq4,sq1),qfadd(sq0,sq3));	dtmp = qfdbl(qt);	ASSERT(dtmp == b4, "b4");	/* b4 = (-  sq0-  sq1-  sq3      +  sq4) */
+		qt = qfsub(sq2,sq3);						dtmp = qfdbl(qt);	ASSERT(dtmp == b6, "b6");	/* b6 = (            -  sq3+  sq2      ) */
+		qt = qfneg(qfadd(sq1,sq3));					dtmp = qfdbl(qt);	ASSERT(dtmp == b7, "b7");	/* b7 = (        -sq1-  sq3            ) */
+		qt = qfmul(qfifth, qfsub( qfmul_pow2(sq3,2), qfadd(qfsub(sq0,sq1),qfadd(sq2,sq4)) ));	dtmp = qfdbl(qt);	ASSERT(dtmp == b8, "b8");	/* b8 = (-  sq0+  sq1+4*sq3-  sq2-  sq4)/5 */
+		qt = qfmul(qfifth, qfadd( sq0 , qfadd(qfsub(sq2,sq1),qfadd(sq3,sq4)) ));	dtmp = qfdbl(qt);	ASSERT(dtmp == b9, "b9");	/* b9 = (   sq0-  sq1+  sq3+  sq2+  sq4)/5 - 1 */
 		qs = qfadd(qfsub(sq0,sq1), sq2);	qs = qfmul_pow2(qs,1);	// 2*(sq0-sq1+sq2)
 		qt = qfadd(sq3,sq4);	qt = qfadd(qt, qfmul_pow2(qt,1));	// 3*(sq3+sq4)
-		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b2, "b2");	/* b2 = (-2*sq0+2*sq1+3*sq3-2*sq2+3*sq4)/5 */
+		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(dtmp == b2, "b2");	/* b2 = (-2*sq0+2*sq1+3*sq3-2*sq2+3*sq4)/5 */
 		qs = qfadd(qfsub(sq4,sq1), sq2);	qs = qfmul_pow2(qs,1);	// 2*(sq4-sq1+sq2)
 		qt = qfadd(sq3,sq0);	qt = qfadd(qt, qfmul_pow2(qt,1));	// 3*(sq3+sq0)
-		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(HERE, dtmp == b5, "b5");	/* b5 = ( 3*sq0+2*sq1+3*sq3-2*sq2-2*sq4)/5 */
+		qt = qfmul(qfifth, qfsub(qt,qs));	dtmp = qfdbl(qt);	ASSERT(dtmp == b5, "b5");	/* b5 = ( 3*sq0+2*sq1+3*sq3-2*sq2-2*sq4)/5 */
 		//================================================================
 	   #endif
 	  #endif
@@ -1042,12 +1042,12 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1071,7 +1071,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1223,8 +1223,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1234,8 +1234,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1244,26 +1244,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1396,7 +1396,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy44_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy44_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1406,7 +1406,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix44_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -2065,8 +2065,8 @@ this means that the output permutation translates (in terms of of 4 radix-11 mac
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -2150,21 +2150,21 @@ this means that the output permutation translates (in terms of of 4 radix-11 mac
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix48_ditN_cy_dif1.c b/src/radix48_ditN_cy_dif1.c
index 22eb6cc2..f0d50cba 100755
--- a/src/radix48_ditN_cy_dif1.c
+++ b/src/radix48_ditN_cy_dif1.c
@@ -299,7 +299,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -367,11 +367,11 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -410,7 +410,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -420,7 +420,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -460,24 +460,24 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix48_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix48_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix48_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	#ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -909,12 +909,12 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -938,7 +938,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1090,8 +1090,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1101,8 +1101,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1111,26 +1111,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1263,7 +1263,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy48_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy48_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1273,7 +1273,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -1754,8 +1754,8 @@ void radix48_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -1869,21 +1869,21 @@ void radix48_dit_pass1(double a[], int n)
 		sse2_rnd= tmp + 0x01;	// sc_ptr += 0xe2 = 226; This is where the value of half_arr_offset48 comes from
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, (r00r == thread_arg->r00r), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00r == thread_arg->r00r), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix512_ditN_cy_dif1.c b/src/radix512_ditN_cy_dif1.c
index c1e524f0..ed56d439 100755
--- a/src/radix512_ditN_cy_dif1.c
+++ b/src/radix512_ditN_cy_dif1.c
@@ -56,7 +56,7 @@ void radix512_dif_pass1(double a[], int n)
 	// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
-	double *addr,*addi;
+	const double *addr,*addi;
 	#include "radix1024_twiddles.h"	// Can share radix-1024 table, just use first 31 of 63 rows here
 	struct complex t[RADIX], *tptr;
 
@@ -69,7 +69,7 @@ void radix512_dif_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n >> 9;
 
@@ -350,7 +350,7 @@ void radix512_dit_pass1(double a[], int n)
 	static int poffs[16],po_br[32];
 	// We prefer pointer-based array-element access, because that allows our radix16 DFT-with-twiddles
 	// to look the same in terms of array-element arglists:
-	double *addr,*addi;
+	const double *addr,*addi;
 	struct complex *tptr;
 	#include "radix1024_twiddles.h"
 	// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
diff --git a/src/radix52_ditN_cy_dif1.c b/src/radix52_ditN_cy_dif1.c
index f621ac72..cabb6680 100755
--- a/src/radix52_ditN_cy_dif1.c
+++ b/src/radix52_ditN_cy_dif1.c
@@ -302,7 +302,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
@@ -364,11 +364,11 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -407,7 +407,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -417,7 +417,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -457,24 +457,24 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix52_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix52_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 48 16-byte slots of sc_arr for temporaries, next 2 for the doubled cos and c3m1 terms,
 	next 52/2 = 26 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -524,7 +524,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 	// sc_ptr += 256; This is where the value of half_arr_offset52 comes from
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes for Mersenne-mod, and radixx16 for Fermat-mod */
 	#endif
-		ASSERT(HERE, (radix52_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix52_creals_in_local_store checksum failed!");
+		ASSERT((radix52_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix52_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		tmp = rad13_const-2;		/* __cc pointer offsets: */
 		VEC_DBL_INIT(tmp,  1.0);	++tmp;	/*	-0x020 = 1.0 */
@@ -912,12 +912,12 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -941,7 +941,7 @@ const double cc1=  0.88545602565320989590,	/* Real part of exp(i*2*pi/13), the r
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1093,8 +1093,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1104,8 +1104,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1114,26 +1114,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1266,7 +1266,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy52_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy52_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1276,7 +1276,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix52_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -1735,8 +1735,8 @@ void radix52_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -1818,21 +1818,21 @@ void radix52_dit_pass1(double a[], int n)
 		sse2_rnd= tmp + 0x01;
 		half_arr= tmp + 0x02;	/* This table needs 20x16 bytes for Mersenne-mod, and radixx16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix56_ditN_cy_dif1.c b/src/radix56_ditN_cy_dif1.c
index a7e236a1..6593b47b 100755
--- a/src/radix56_ditN_cy_dif1.c
+++ b/src/radix56_ditN_cy_dif1.c
@@ -421,7 +421,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(first_entry)
 	{
-		ASSERT(HERE, LO_ADD,"LO_ADD");
+		ASSERT(LO_ADD,"LO_ADD");
 		psave = p;	nsave = n;
 		radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)RADIX));
 		n2inv     = qfdbl(qf_rational_quotient((int64)1, (int64)(n/2)));
@@ -447,11 +447,11 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -489,7 +489,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -499,7 +499,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -542,24 +542,24 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of radix56_creals_in_local_store vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix56_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix56_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low slots of sc_arr for temporaries, next few for the nontrivial complex 16th roots,
 	next few for the doubled carry pairs, next 2 for ROE and RND_CONST, next RADIX for the half_arr table lookup stuff,
@@ -755,7 +755,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = 0x%16" PRIX64 "\n",j,qfdbl_as_uint64(qx));	// Literal "0x" kept: the '#' flag would print "0X", count it in the width, and drop it for 0
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1147,12 +1147,12 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1187,7 +1187,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1238,7 +1238,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				};
 			}
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1657,8 +1657,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1668,8 +1668,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1678,19 +1678,19 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -1699,11 +1699,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1713,8 +1713,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1936,7 +1936,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy56_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy56_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1946,7 +1946,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2496,8 +2496,8 @@ void radix56_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2636,16 +2636,16 @@ void radix56_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
+		ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!");
 	  #if defined(USE_AVX2) || defined(USE_ARM_V8_SIMD)
 		// AVX2 (i.e. FMA)means non-Nussbaumer radix-7, uses these sincos constants:
-		ASSERT(HERE, (ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!");
+		ASSERT((ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!");
 	  #else
 		/* SSE2 version assumes LO_ADD = 0, i.e. the low-mul Nussbaumer-style DFT implementation: */
-		ASSERT(HERE, (ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!");
+		ASSERT((ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!");
 	  #endif
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2654,15 +2654,15 @@ void radix56_dit_pass1(double a[], int n)
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix60_ditN_cy_dif1.c b/src/radix60_ditN_cy_dif1.c
index c2ddcad7..55c7aae3 100755
--- a/src/radix60_ditN_cy_dif1.c
+++ b/src/radix60_ditN_cy_dif1.c
@@ -457,11 +457,11 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
 	//	printf("0: wt*inv-1 = %15.8e\n",fabs(wts_mult[0]*inv_mult[0] - 1.));
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 	//	printf("1: wt*inv-1 = %15.8e\n",fabs(wts_mult[1]*inv_mult[1] - 1.));
 
 	#ifdef MULTITHREAD
@@ -500,7 +500,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -510,7 +510,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -553,25 +553,25 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread, the latter of which
 		// provide thread-local storage for int-data and tables
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix60_creals_in_local_store + (((12+RADIX/2)/2 + ODD_RADIX + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix60_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 2*RADIX vector-double-sized slots of sc_arr for s1p* temporaries, next 2*RADIX slots for r* temps,
 	next RADIX slots for x and y in-place DFT temps, next 7 for the complex root combs needed for the radix-3 and -5 sub-DFTs,
@@ -705,8 +705,8 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	#endif
 
-		ASSERT(HERE, half_arr_offset60 == (uint32)(half_arr-sc_ptr), "half_arr_offset60 mismatches actual!");
-		ASSERT(HERE, (radix60_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix60_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset60 == (uint32)(half_arr-sc_ptr), "half_arr_offset60 mismatches actual!");
+		ASSERT((radix60_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix60_creals_in_local_store checksum failed!");
 		/* These remain fixed: */
 		VEC_DBL_INIT(sse2_c3m1, c3m1);
 		VEC_DBL_INIT(sse2_s   , s   );
@@ -790,7 +790,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1153,18 +1153,18 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 		if(_cy_r[0])	/* If it's a new exponent of a range test, need to deallocate these. */
 		{
-			ASSERT(HERE, 0 != _i, "free(_i) but ptr = 0x0!");
+			ASSERT(0 != _i, "free(_i) but ptr = 0x0!");
 			for(i= 0; i < RADIX; i++) {
-				ASSERT(HERE, 0 != _bjmodn[i], "free(_bjmodn[i]) but ptr = 0x0!");
-				ASSERT(HERE, 0 !=   _cy_r[i], "free(_cy_r[i]) but ptr = 0x0!");
-				ASSERT(HERE, 0 !=   _cy_i[i], "free(_cy_i[i]) but ptr = 0x0!");
+				ASSERT(0 != _bjmodn[i], "free(_bjmodn[i]) but ptr = 0x0!");
+				ASSERT(0 !=   _cy_r[i], "free(_cy_r[i]) but ptr = 0x0!");
+				ASSERT(0 !=   _cy_i[i], "free(_cy_i[i]) but ptr = 0x0!");
 			}
-			ASSERT(HERE, 0 != _jstart, "free(_jstart) but ptr = 0x0!");
-			ASSERT(HERE, 0 != _jhi, "free(_jhi) but ptr = 0x0!");
-			ASSERT(HERE, 0 != _col, "free(_col) but ptr = 0x0!");
-			ASSERT(HERE, 0 != _co2, "free(_co2) but ptr = 0x0!");
-			ASSERT(HERE, 0 != _co3, "free(_co3) but ptr = 0x0!");
-			ASSERT(HERE, 0 != _bjmodnini, "free(_bjmodnini) but ptr = 0x0!");
+			ASSERT(0 != _jstart, "free(_jstart) but ptr = 0x0!");
+			ASSERT(0 != _jhi, "free(_jhi) but ptr = 0x0!");
+			ASSERT(0 != _col, "free(_col) but ptr = 0x0!");
+			ASSERT(0 != _co2, "free(_co2) but ptr = 0x0!");
+			ASSERT(0 != _co3, "free(_co3) but ptr = 0x0!");
+			ASSERT(0 != _bjmodnini, "free(_bjmodnini) but ptr = 0x0!");
 
 			free((void *)_i     ); _i      = 0x0;
 			for(i = 0; i < RADIX; i++) {
@@ -1198,12 +1198,12 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1238,7 +1238,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1289,7 +1289,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				};
 			}
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1757,8 +1757,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1768,8 +1768,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1778,21 +1778,21 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
-		ASSERT(HERE, ((tmp + 300)->d0 == c3m1 && (tmp + 300)->d1 == c3m1), "thread-local memcheck failed!");
+		ASSERT(((tmp + 300)->d0 == c3m1 && (tmp + 300)->d1 == c3m1), "thread-local memcheck failed!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -1801,11 +1801,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1815,8 +1815,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -2038,7 +2038,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy60_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy60_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -2048,7 +2048,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix32_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -2630,8 +2630,8 @@ void radix60_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2780,7 +2780,7 @@ void radix60_dit_pass1(double a[], int n)
 		y0e   = tmp + 0x1c;
 		tmp += 0x1e;
 
-		ASSERT(HERE, (tmp->d0 == tmp->d1) && (tmp->d0 == c3m1), "thread-local memcheck failed!");
+		ASSERT((tmp->d0 == tmp->d1) && (tmp->d0 == c3m1), "thread-local memcheck failed!");
 		sse2_c3m1 = tmp + 0x00;
 		sse2_s    = tmp + 0x01;
 		sse2_cn1  = tmp + 0x02;
@@ -2815,10 +2815,10 @@ void radix60_dit_pass1(double a[], int n)
 								// +20 = 390 complex, round up to nearby multiple of 4
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2827,15 +2827,15 @@ void radix60_dit_pass1(double a[], int n)
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix60_main_carry_loop.h b/src/radix60_main_carry_loop.h
index 7ecc98f4..298470dd 100755
--- a/src/radix60_main_carry_loop.h
+++ b/src/radix60_main_carry_loop.h
@@ -644,7 +644,7 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 
 		#ifdef USE_AVX512
 			// will never hit this since have same assert in preprocessing code - just a placeholder/reminder:
-			ASSERT(HERE, 0, "radix60_ditN_cy_dif1: No AVX-512 support for Fermat-mod; Skipping this leading radix.");
+			ASSERT(0, "radix60_ditN_cy_dif1: No AVX-512 support for Fermat-mod; Skipping this leading radix.");
 
 		#else	// AVX / AVX2
 
diff --git a/src/radix63_ditN_cy_dif1.c b/src/radix63_ditN_cy_dif1.c
index 41d8438c..5fa18017 100755
--- a/src/radix63_ditN_cy_dif1.c
+++ b/src/radix63_ditN_cy_dif1.c
@@ -265,7 +265,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -275,7 +275,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -315,7 +315,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < 4, "Failed to align cy_dat array!");
+			ASSERT(l < 4, "Failed to align cy_dat array!");
 		}
 	#endif
 
@@ -367,12 +367,12 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		}
 		_maxerr	= (double *)malloc(j);	ptr_prod += (uint32)(_maxerr== 0x0);
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -407,7 +407,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -458,7 +458,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				};
 			}
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 			/* Subtract nwt from the increments to ease fast-mod */
 			wts_idx_incr -= nwt;
@@ -608,8 +608,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -619,8 +619,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = _maxerr[ithread];
@@ -629,11 +629,11 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* init carries	*/
@@ -732,7 +732,7 @@ for(outer=0; outer <= 1; outer++)
 	/*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/
 	for(j = 0; j < main_work_units; ++j)
 	{
-		ASSERT(HERE, 0x0 == cy63_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy63_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -742,7 +742,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
diff --git a/src/radix64_ditN_cy_dif1.c b/src/radix64_ditN_cy_dif1.c
index 52470015..3fbc585e 100755
--- a/src/radix64_ditN_cy_dif1.c
+++ b/src/radix64_ditN_cy_dif1.c
@@ -401,11 +401,11 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -444,7 +444,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -454,7 +454,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -497,23 +497,23 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread
 		cslots_in_local_store = radix64_creals_in_local_store + (20+RADIX/2)/2;	// Just add enough int64 space for both cases, plus some
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix64_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	/* Use low 64 vec_ddl-sized slots of sc_arr for temporaries, next 7 for the nontrivial complex 16th roots,
 	next 32 for the vector carries, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff,
@@ -679,8 +679,8 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
-//		ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix64_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix64_creals_in_local_store checksum failed!");
+//		ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix64_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix64_creals_in_local_store checksum failed!");
 
 	  #if !USE_SCALAR_DFT_MACRO
 		/* These remain fixed: */
@@ -691,7 +691,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		dtmp = *(double *)&isrt2_dn;	VEC_DBL_INIT(isrt2, dtmp);
 		VEC_DBL_INIT(nisrt2,-dtmp);
 		VEC_DBL_INIT( isrt2, dtmp);									// Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround:
-		VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
+		VEC_DBL_INIT( cc0,   1.0);		VEC_DBL_INIT( ss0,   0.0);	//	tmp =  cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2");	Disable to allow "round down" variant
 		VEC_DBL_INIT( cc1, c64_1);		VEC_DBL_INIT( ss1, s64_1);		tmp =  cc1-1; VEC_DBL_INIT(tmp, dtmp);
 		VEC_DBL_INIT( cc2, c32_1);		VEC_DBL_INIT( ss2, s32_1);		tmp =  cc2-1; VEC_DBL_INIT(tmp, dtmp);
 		VEC_DBL_INIT( cc3, c64_3);		VEC_DBL_INIT( ss3, s64_3);		tmp =  cc3-1; VEC_DBL_INIT(tmp, dtmp);
@@ -799,7 +799,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 				qc = qfcos(qt);	qs = qfsin(qt);
 				qx = QONE;		qy = QZRO;
 				for(j = 0; j < RADIX; j++) {
-					printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+					printf("j = %3u: cos = 0x%16" PRIX64 "\n",j,qfdbl_as_uint64(qx));	// Literal "0x" kept on purpose: the '#' flag would print "0X", count the prefix toward the field width, and drop it for a zero value
 					// Up-multiply the complex exponential:
 					qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 					qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -1198,14 +1198,14 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/radix-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < NDIVR/CY_THREADS; j++)
@@ -1225,7 +1225,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 			if(CY_THREADS > 1)
 			{
 				for(ithread = 1; ithread < CY_THREADS; ithread++)
@@ -1404,8 +1404,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1415,8 +1415,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1425,19 +1425,19 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].r00;
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 
@@ -1447,11 +1447,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1464,8 +1464,8 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_SSE2)
 			// This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots:
-			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d0 * (tmp+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = tmp->d1 * (tmp+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1687,7 +1687,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy64_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy64_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1697,7 +1697,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 //	printf("radix64_ditN_cy_dif1 end  ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks);
 
@@ -2524,8 +2524,8 @@ void radix64_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -2783,10 +2783,10 @@ Workaround: Compiled just this file with -O2, rest with usual -O3.
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -2796,18 +2796,18 @@ Workaround: Compiled just this file with -O2, rest with usual -O3.
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#else	// SSE2:
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		} else {
 		#ifdef USE_AVX512
 			/* No-Op */
 		#else
-			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d0 * (half_arr+1)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (half_arr)->d1 * (half_arr+1)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 		}
 
diff --git a/src/radix768_ditN_cy_dif1.c b/src/radix768_ditN_cy_dif1.c
index edec074a..4c10958a 100755
--- a/src/radix768_ditN_cy_dif1.c
+++ b/src/radix768_ditN_cy_dif1.c
@@ -311,7 +311,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 
 	if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)
 	{
-		ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
+		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
 	// Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array:
@@ -377,11 +377,11 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -419,7 +419,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -429,7 +429,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -469,24 +469,24 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use double-complex type size (16 bytes) to alloc a block of local storage
 		// consisting of radix768_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix768_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix768_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -541,8 +541,8 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
-//		ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix768_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix768_creals_in_local_store checksum failed!");
+//		ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix768_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix768_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(two  , 2.0  );	VEC_DBL_INIT(one, 1.0  );
@@ -1328,12 +1328,12 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays.");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/RADIX-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1357,7 +1357,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 	#ifdef USE_PTHREAD
 		/* Populate the elements of the thread-specific data structs which don't change after init: */
@@ -1509,8 +1509,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1520,8 +1520,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1530,26 +1530,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	  #ifdef USE_AVX512
 			/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	#endif
 		/* init carries: */
@@ -1679,7 +1679,7 @@ for(outer=0; outer <= 1; outer++)
 	for(j = 0; j < main_work_units; ++j)
 	{
 	//	printf("adding main task %d\n",j + pool_work_units);
-		ASSERT(HERE, 0x0 == cy768_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy768_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1689,7 +1689,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2668,8 +2668,8 @@ void radix768_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 
 		/*   constant index offsets for array load/stores are here.	*/
@@ -3107,21 +3107,21 @@ void radix768_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */
 	  #endif
 
-		ASSERT(HERE, (r000 == thread_arg->r000), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r000 == thread_arg->r000), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	  #ifdef USE_AVX512
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix7_ditN_cy_dif1.c b/src/radix7_ditN_cy_dif1.c
index 280e714a..40ad49b6 100755
--- a/src/radix7_ditN_cy_dif1.c
+++ b/src/radix7_ditN_cy_dif1.c
@@ -119,7 +119,7 @@ int radix7_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	if(first_entry)
 	{
-		ASSERT(HERE, LO_ADD,"radix7_ditN_cy_dif1.c: LO_ADD");
+		ASSERT(LO_ADD,"radix7_ditN_cy_dif1.c: LO_ADD");
 		psave = p;	nsave = n;
 		first_entry=FALSE;
 
diff --git a/src/radix8_dif_dit_pass.c b/src/radix8_dif_dit_pass.c
index 0552bb27..fb0ac012 100755
--- a/src/radix8_dif_dit_pass.c
+++ b/src/radix8_dif_dit_pass.c
@@ -104,12 +104,12 @@ void radix8_dif_pass(double a[], int n, struct complex rt0[], struct complex rt1
 	{
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 16 16-byte slots of sc_arr for temporaries, next 16 for the doubled sincos twiddles,
 	next 1 for doubled 1/sqrt2, plus at least 3 more slots to allow for 64-byte alignment of the array:
@@ -191,7 +191,7 @@ void radix8_dif_pass(double a[], int n, struct complex rt0[], struct complex rt1
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
   #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 	c0 = __r0 + thr_id*36 + 0x10;
 	c4    = c0 + 0x02;
 	c2    = c0 + 0x04;
@@ -564,12 +564,12 @@ void radix8_dit_pass(double a[], int n, struct complex rt0[], struct complex rt1
 	{
 		max_threads = init_sse2;
 	#ifndef COMPILER_TYPE_GCC
-		ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+		ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 	#endif
-		ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 
 	/* Use low 16 16-byte slots of sc_arr for temporaries, next 16 for the doubled sincos twiddles,
 	next 1 for doubled 1/sqrt2, plus at least 3 more slots to allow for 64-byte alignment of the array:
@@ -654,7 +654,7 @@ void radix8_dit_pass(double a[], int n, struct complex rt0[], struct complex rt1
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
   #ifdef MULTITHREAD
-	ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+	ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 	c0 = __r0 + thr_id*36 + 0x10;
 	c4    = c0 + 0x02;
 	c2    = c0 + 0x04;
diff --git a/src/radix8_ditN_cy_dif1.c b/src/radix8_ditN_cy_dif1.c
index 3b7414c7..b1c22dde 100755
--- a/src/radix8_ditN_cy_dif1.c
+++ b/src/radix8_ditN_cy_dif1.c
@@ -127,12 +127,12 @@ int radix8_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		if(CY_THREADS > MAX_THREADS)
 			CY_THREADS = MAX_THREADS;
 
-		ASSERT(HERE, CY_THREADS >= NTHREADS,"radix8_ditN_cy_dif1.c: CY_THREADS < NTHREADS");
-		ASSERT(HERE, isPow2(CY_THREADS)    ,"radix8_ditN_cy_dif1.c: CY_THREADS not a power of 2!");
+		ASSERT(CY_THREADS >= NTHREADS,"radix8_ditN_cy_dif1.c: CY_THREADS < NTHREADS");
+		ASSERT(isPow2(CY_THREADS)    ,"radix8_ditN_cy_dif1.c: CY_THREADS not a power of 2!");
 		if(CY_THREADS > 1)
 		{
-			ASSERT(HERE, n8       %CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n8      %CY_THREADS != 0 ... likely more threads than this leading radix can handle.");
-			ASSERT(HERE, n_div_nwt%CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n_div_nwt%CY_THREADS != 0 ... likely more threads than this leading radix can handle.");
+			ASSERT(n8       %CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n8      %CY_THREADS != 0 ... likely more threads than this leading radix can handle.");
+			ASSERT(n_div_nwt%CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n_div_nwt%CY_THREADS != 0 ... likely more threads than this leading radix can handle.");
 		}
 
 	#ifdef MULTITHREAD
@@ -192,47 +192,47 @@ int radix8_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			free((void *)_bjmodnini); _bjmodnini = 0x0;
 		}
 
-		_i       = (int *)malloc(CY_THREADS*sizeof(int)); if(!_i     ) { sprintf(cbuf,"ERROR: unable to allocate array _i       in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn0 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn0){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn1 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn1){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn2 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn2){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn3 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn3){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn4 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn4){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn5 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn5){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn6 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn6){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_bjmodn7 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn7){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_jstart  = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jstart ){ sprintf(cbuf,"ERROR: unable to allocate array _jstart  in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_jhi     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jhi    ){ sprintf(cbuf,"ERROR: unable to allocate array _jhi     in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_col     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_col    ){ sprintf(cbuf,"ERROR: unable to allocate array _col    in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_co2     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co2    ){ sprintf(cbuf,"ERROR: unable to allocate array _co2    in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_co3     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co3    ){ sprintf(cbuf,"ERROR: unable to allocate array _co3    in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-
-		_cy_r0  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r1  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r2  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r3  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r4  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r5  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r6  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_r7  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-
-		_cy_i0  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i1  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i2  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i3  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i4  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i5  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i6  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-		_cy_i7  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
-
-		_maxerr = (double *)malloc(CY_THREADS*sizeof(double)); if(!_maxerr){ sprintf(cbuf,"ERROR: unable to allocate array _maxerr in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_i       = (int *)malloc(CY_THREADS*sizeof(int)); if(!_i     ) { sprintf(cbuf,"ERROR: unable to allocate array _i       in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn0 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn0){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn1 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn1){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn2 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn2){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn3 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn3){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn4 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn4){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn5 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn5){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn6 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn6){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_bjmodn7 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn7){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_jstart  = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jstart ){ sprintf(cbuf,"ERROR: unable to allocate array _jstart  in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_jhi     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jhi    ){ sprintf(cbuf,"ERROR: unable to allocate array _jhi     in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_col     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_col    ){ sprintf(cbuf,"ERROR: unable to allocate array _col    in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_co2     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co2    ){ sprintf(cbuf,"ERROR: unable to allocate array _co2    in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_co3     = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co3    ){ sprintf(cbuf,"ERROR: unable to allocate array _co3    in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+
+		_cy_r0  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r1  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r2  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r3  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r4  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r5  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r6  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_r7  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+
+		_cy_i0  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i1  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i2  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i3  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i4  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i5  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i6  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+		_cy_i7  = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
+
+		_maxerr = (double *)malloc(CY_THREADS*sizeof(double)); if(!_maxerr){ sprintf(cbuf,"ERROR: unable to allocate array _maxerr in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 			/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 			i.e. the one that n2/8-separated FFT outputs need:
 			*/
-			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			_bjmodnini[0] = 0;
 			_bjmodnini[1] = 0;
 			for(j=0; j < n8/CY_THREADS; j++)
@@ -253,7 +253,7 @@ int radix8_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			{
 				bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 			}
-			ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+			ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 		}
 	}	/* endif(first_entry) */
 
@@ -635,7 +635,7 @@ for(outer=0; outer <= 1; outer++)
 
 		for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 		{
-			ASSERT(HERE, CY_THREADS > 1,"radix8_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
+			ASSERT(CY_THREADS > 1,"radix8_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
 			_cy_r0[ithread] = _cy_r0[ithread-1];
 			_cy_r1[ithread] = _cy_r1[ithread-1];
 			_cy_r2[ithread] = _cy_r2[ithread-1];
@@ -673,7 +673,7 @@ for(outer=0; outer <= 1; outer++)
 			// Must use n8 instead of p1 here since p1 may have pads which are not applied to element-2-slots-before
 			j1 = n8-2;	j1 += ( (j1 >> DAT_BITS) << PAD_BITS );
 			j2 = j1+RE_IM_STRIDE;
-			ASSERT(HERE, t15 <= 1.0 && t16 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!");
+			ASSERT(t15 <= 1.0 && t16 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!");
 			// Undo the initial dif pass just for the 16 complex terms in question:
 			RADIX_08_DIT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7]
 						,_t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14,_t15,_t16
@@ -691,11 +691,11 @@ for(outer=0; outer <= 1; outer++)
 			// Verify that any cyout = 1 has the corresponding high word < 0,
 			// then absorb cyout back into the high word and zero the carry:
 			if(t15 == 1.0) {
-				ASSERT(HERE, a[j1+p7] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!");
+				ASSERT(a[j1+p7] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!");
 				a[j1+p7] += FFT_MUL_BASE;	t15 = 0.0;
 			}
 			if(t16 == 1.0) {
-				ASSERT(HERE, a[j2+p7] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!");
+				ASSERT(a[j2+p7] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!");
 				a[j2+p7] += FFT_MUL_BASE;	t16 = 0.0;
 			}
 			// Redo the initial dif pass just for the 16 complex terms in question:
@@ -707,7 +707,7 @@ for(outer=0; outer <= 1; outer++)
 
 		for(ithread = CY_THREADS - 1; ithread > 0; ithread--)
 		{
-			ASSERT(HERE, CY_THREADS > 1,"radix8_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
+			ASSERT(CY_THREADS > 1,"radix8_ditN_cy_dif1.c: ");	/* Make sure loop only gets executed if multiple threads */
 			_cy_r0[ithread] = _cy_r0[ithread-1];		_cy_i0[ithread] = _cy_i0[ithread-1];
 			_cy_r1[ithread] = _cy_r1[ithread-1];		_cy_i1[ithread] = _cy_i1[ithread-1];
 			_cy_r2[ithread] = _cy_r2[ithread-1];		_cy_i2[ithread] = _cy_i2[ithread-1];
diff --git a/src/radix960_ditN_cy_dif1.c b/src/radix960_ditN_cy_dif1.c
index fc7433f5..f9ffe751 100755
--- a/src/radix960_ditN_cy_dif1.c
+++ b/src/radix960_ditN_cy_dif1.c
@@ -484,11 +484,11 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		qt = qfexp(qt);			// ...and get 2^x via exp[x*ln(2)].
 		wts_mult[0] = qfdbl(qt);		// a = 2^(x/n), with x = sw
 		inv_mult[0] = qfdbl(qfinv(qt));	// Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		//curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp:
 		wts_mult[1] = 0.5*wts_mult[0];
 		inv_mult[1] = 2.0*inv_mult[0];
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 
 	#ifdef MULTITHREAD
 
@@ -526,7 +526,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -536,7 +536,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -579,24 +579,24 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!");
+			ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((intptr_t)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix960_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix960_creals_in_local_store);
-		ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -666,8 +666,8 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 32 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, half_arr_offset960 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix960_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix960_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset960 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix960_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix960_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(isrt2,ISRT2);
@@ -759,7 +759,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			qc = qfcos(qt);	qs = qfsin(qt);
 			qx = QONE;		qy = QZRO;
 			for(j = 0; j < RADIX; j++) {
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
@@ -849,7 +849,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 	}		/************************************************************************/
 	else	/*                MODULUS_TYPE_MERSENNE:                                */
 	{		/************************************************************************/
-		ASSERT(HERE, tmp == half_arr, "tmp == half_arr check failed!");
+		ASSERT(tmp == half_arr, "tmp == half_arr check failed!");
 	#ifdef USE_AVX512
 		/* Each lookup-category in the 'mini-tables' used in AVX mode balloons from 16x32-bytes to 64x64-bytes,
 			so switch to an opmask-based scheme which starts with e.g. a broadcast constant and onditional doubling.
@@ -1271,12 +1271,12 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1311,7 +1311,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1362,7 +1362,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				};
 			}	//	printf("wts_idx_incr = %u\n",wts_idx_incr);
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1796,8 +1796,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1807,8 +1807,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1817,21 +1817,21 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
 	  #ifdef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts
-		ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
-		// ASSERT(HERE, ((tmp+0)->d0 == 0.50 && (tmp+0)->d1 == 0.50 && (tmp+0)->d2 == 0.50 && (tmp+0)->d3 == 0.50 && (tmp+0)->d4 == 0.50 && (tmp+0)->d5 == 0.50 && (tmp+0)->d6 == 0.50 && (tmp+0)->d7 == 0.50, "thread-local memcheck failed!");
-		// ASSERT(HERE, ((tmp+1)->d0 == 0.25 && (tmp+1)->d1 == 0.25 && (tmp+1)->d2 == 0.25 && (tmp+1)->d3 == 0.25 && (tmp+1)->d4 == 0.25 && (tmp+1)->d5 == 0.25 && (tmp+1)->d6 == 0.25 && (tmp+1)->d7 == 0.25, "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!");
+		// ASSERT(((tmp+0)->d0 == 0.50 && (tmp+0)->d1 == 0.50 && (tmp+0)->d2 == 0.50 && (tmp+0)->d3 == 0.50 && (tmp+0)->d4 == 0.50 && (tmp+0)->d5 == 0.50 && (tmp+0)->d6 == 0.50 && (tmp+0)->d7 == 0.50, "thread-local memcheck failed!");
+		// ASSERT(((tmp+1)->d0 == 0.25 && (tmp+1)->d1 == 0.25 && (tmp+1)->d2 == 0.25 && (tmp+1)->d3 == 0.25 && (tmp+1)->d4 == 0.25 && (tmp+1)->d5 == 0.25 && (tmp+1)->d6 == 0.25 && (tmp+1)->d7 == 0.25, "thread-local memcheck failed!");
 	  #else
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -1840,11 +1840,11 @@ for(outer=0; outer <= 1; outer++)
 			/* No-Op */
 		#elif defined(USE_AVX)
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1854,8 +1854,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -2076,7 +2076,7 @@ for(outer=0; outer <= 1; outer++)
 	/*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/
 	for(j = 0; j < main_work_units; ++j)
 	{
-		ASSERT(HERE, 0x0 == cy960_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy960_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -2086,7 +2086,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
@@ -2270,7 +2270,7 @@ void radix960_dif_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n/RADIX;
 
@@ -2734,7 +2734,7 @@ void radix960_dit_pass1(double a[], int n)
 
 	if(first_entry)
 	{
-		ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
+		ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!");
 		first_entry=FALSE;
 		NDIVR = n/RADIX;
 
@@ -3410,8 +3410,8 @@ void radix960_dit_pass1(double a[], int n)
 		double *wt1 = thread_arg->wt1;
 		double *wts_mult = thread_arg->wts_mult;	// Const Intra-block wts-multiplier...
 		double *inv_mult = thread_arg->inv_mult;	// ...and 2*(its multiplicative inverse).
-		ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
-		ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!");
+		ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!");
 		int *si = thread_arg->si;
 		struct complex *rn0 = thread_arg->rn0;
 		struct complex *rn1 = thread_arg->rn1;
@@ -3629,10 +3629,10 @@ void radix960_dit_pass1(double a[], int n)
 		half_arr= tmp + 0x02;
 	  #endif
 
-		ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!");
-		ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
+		ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!");
+		ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!");
 	  #ifndef USE_AVX512	// In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts:
-		ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!");
 	  #endif
 		tmp = half_arr;
 	if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
@@ -3641,15 +3641,15 @@ void radix960_dit_pass1(double a[], int n)
 		/* No-Op */
 	  #elif defined(USE_AVX)
 		// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #else	// SSE2:
-		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 	  #endif
 	} else {
-		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+		dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 	}
 
 		VEC_DBL_INIT(max_err, 0.0);
diff --git a/src/radix992_ditN_cy_dif1.c b/src/radix992_ditN_cy_dif1.c
index c826e6e6..ca0efd9b 100755
--- a/src/radix992_ditN_cy_dif1.c
+++ b/src/radix992_ditN_cy_dif1.c
@@ -387,7 +387,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				if(CY_THREADS > 1) {
 					main_work_units = CY_THREADS/2;
 					pool_work_units = CY_THREADS - main_work_units;
-					ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
+					ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!");
 					printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units);
 				} else {
 					main_work_units = 1;
@@ -397,7 +397,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			#else
 
 				pool_work_units = CY_THREADS;
-				ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
+				ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!");
 
 			#endif
 
@@ -438,24 +438,24 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				}
 			}
-			ASSERT(HERE, l < 4, "Failed to align cy_dat array!");
+			ASSERT(l < 4, "Failed to align cy_dat array!");
 		}
 	#endif
 
 	#ifdef USE_SSE2
 
-		ASSERT(HERE, ((uint32)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
-		ASSERT(HERE, ((uint32)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
+		ASSERT(((uint32)wt0    & 0x3f) == 0, "wt0[]  not 64-byte aligned!");
+		ASSERT(((uint32)wt1    & 0x3f) == 0, "wt1[]  not 64-byte aligned!");
 
 		// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
 		// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread
 		// (Add as many padding elts to the latter as needed to make it a multiple of 4):
 		cslots_in_local_store = radix992_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
-		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		sm_ptr = (uint64*)(sc_ptr + radix992_creals_in_local_store);
-		ASSERT(HERE, ((uint32)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
+		ASSERT(((uint32)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
 
 	  #ifdef USE_PTHREAD
 		__r0 = sc_ptr;
@@ -518,8 +518,8 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		// This is where the value of half_arr_offset comes from
 		half_arr= tmp + 0x02;	/* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */
 	  #endif
-		ASSERT(HERE, half_arr_offset992 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
-		ASSERT(HERE, (radix992_creals_in_local_store << L2_SZ_VD) >= ((long)half_arr - (long)r00) + (20 << L2_SZ_VD), "radix992_creals_in_local_store checksum failed!");
+		ASSERT(half_arr_offset992 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!");
+		ASSERT((radix992_creals_in_local_store << L2_SZ_VD) >= ((long)half_arr - (long)r00) + (20 << L2_SZ_VD), "radix992_creals_in_local_store checksum failed!");
 
 		/* These remain fixed: */
 		VEC_DBL_INIT(isrt2,ISRT2);
@@ -600,7 +600,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 				// Up-multiply the complex exponential:
 				qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt);	// Store qxnew in qmul for now.
 				qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy   = qfadd(qn, qt); qx = qmul;
-				printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx));
+				printf("j = %3u: cos = 0x%16" PRIX64 "\n",j,qfdbl_as_uint64(qx));
 			}
 			exit(0);
 		#endif
@@ -1109,12 +1109,12 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 			_cy_i[i]	= (double *)malloc(j);	ptr_prod += (uint32)(_cy_i[i]== 0x0);
 		}
 
-		ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
+		ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!");
 
 		/* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment,
 		i.e. the one that n2/radix-separated FFT outputs need:
 		*/
-		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		_bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int));	if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		_bjmodnini[0] = 0;
 		_bjmodnini[1] = 0;
 
@@ -1149,7 +1149,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 		{
 			bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n);
 		}
-		ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
+		ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini");
 
 		// In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops:
 		if(CY_THREADS > 1)
@@ -1200,7 +1200,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[]
 					break;
 				};
 			}
-			ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!");
+			ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!");
 
 		#ifdef USE_SSE2
 			wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3);	/* In the SIMD version, use icycle0-6 as actual address
@@ -1585,8 +1585,8 @@ for(outer=0; outer <= 1; outer++)
 	{
 		tdat[ithread].iter = iter;
 	// int data:
-		ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!");
 
 		tdat[ithread].khi    = khi;
 		tdat[ithread].i      = _i[ithread];	/* Pointer to the BASE and BASEINV arrays.	*/
@@ -1596,8 +1596,8 @@ for(outer=0; outer <= 1; outer++)
 		tdat[ithread].col = _col[ithread];
 		tdat[ithread].co2 = _co2[ithread];
 		tdat[ithread].co3 = _co3[ithread];
-		ASSERT(HERE, tdat[ithread].sw  == sw, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].sw  == sw, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!");
 
 	// double data:
 		tdat[ithread].maxerr = 0.0;
@@ -1606,26 +1606,26 @@ for(outer=0; outer <= 1; outer++)
 
 	// pointer data:
 		tdat[ithread].arrdat = a;			/* Main data array */
-		ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].si  == si, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].si  == si, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!");
 	#ifdef USE_SSE2
-		ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
-		ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!");
+		ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!");
 		tmp = tdat[ithread].half_arr;
-		ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
+		ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!");
 	#endif
 		if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE)
 		{
 		#ifdef USE_AVX
 			// Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further]
-			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d0 * (tmp+56)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+40)->d1 * (tmp+56)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#elif defined(USE_SSE2)
-			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d0 * (tmp+14)->d0;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp+10)->d1 * (tmp+14)->d1;	ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1635,8 +1635,8 @@ for(outer=0; outer <= 1; outer++)
 		else	/* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */
 		{
 		#ifdef USE_SSE2
-			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
-			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
+			dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1;	ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!");
 		#endif
 			/* init carries	*/
 			for(i = 0; i < RADIX; i++) {
@@ -1795,7 +1795,7 @@ for(outer=0; outer <= 1; outer++)
 	/*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/
 	for(j = 0; j < main_work_units; ++j)
 	{
-		ASSERT(HERE, 0x0 == cy992_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
+		ASSERT(0x0 == cy992_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!");
 	}
 
   #endif
@@ -1805,7 +1805,7 @@ for(outer=0; outer <= 1; outer++)
 	ns_time.tv_nsec = 100000;	// (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here
 
 	while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) {
-		ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
+		ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!");
 	}
 
 	/* Copy the thread-specific output carry data back to shared memory: */
diff --git a/src/rng_isaac.c b/src/rng_isaac.c
index 6110639f..1a96da9b 100755
--- a/src/rng_isaac.c
+++ b/src/rng_isaac.c
@@ -162,8 +162,8 @@ double	rng_isaac_rand_double_norm_pos()
 	/* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */
 	if(retval < 0.0 || retval > 1.0)
 	{
-		sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16llx, iran64 = %16llx, retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
-		ASSERT(HERE, 0, cbuf);
+		sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
+		ASSERT(0, cbuf);
 	}
 	return retval;
 }
@@ -193,8 +193,8 @@ double	rng_isaac_rand_double_norm_pm1()
 	/* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */
 	if(retval < -1.0 || retval > 1.0)
 	{
-		sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16llx, iran64 = %16llx, retval = %lf not in [0,1]!\n", itmp64, iran64, retval);
-		ASSERT(HERE, 0, cbuf);
+		sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [-1,1]!\n", itmp64, iran64, retval);
+		ASSERT(0, cbuf);
 	}
 	return retval;
 }
diff --git a/src/test_fft_radix.c b/src/test_fft_radix.c
index 3036263e..fa3562e4 100755
--- a/src/test_fft_radix.c
+++ b/src/test_fft_radix.c
@@ -324,43 +324,43 @@ void test_fft_radix(void)
 	index        = ALLOC_INT(index       , RADIX);
 	dit_scramble = ALLOC_INT(dit_scramble, RADIX);
 	/* double a[rmul*RADIX], b[rmul*RADIX], arrtmp[rmul*RADIX]: */
-	ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX);	ASSERT(HERE, (ptmp != 0x0), "FATAL: unable to allocate array A in test_fft_radix.\n");
+	ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX);	ASSERT((ptmp != 0x0), "FATAL: unable to allocate array A in test_fft_radix.\n");
 	a    = ALIGN_DOUBLE(ptmp);	ptmp = 0x0;
 	ac = (struct complex *)a;
-	ASSERT(HERE, ((long)((void *)a) & 63) == 0x0,"test_fft_radix: A[] not aligned on 64-byte boundary!");
-	ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX);	ASSERT(HERE, (ptmp != 0x0), "FATAL: unable to allocate array B in test_fft_radix.\n");
+	ASSERT(((long)((void *)a) & 63) == 0x0,"test_fft_radix: A[] not aligned on 64-byte boundary!");
+	ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX);	ASSERT((ptmp != 0x0), "FATAL: unable to allocate array B in test_fft_radix.\n");
 	b    = ALIGN_DOUBLE(ptmp);	ptmp = 0x0;
-	ASSERT(HERE, ((long)((void *)b) & 63) == 0x0,"test_fft_radix: B[] not aligned on 64-byte boundary!");
+	ASSERT(((long)((void *)b) & 63) == 0x0,"test_fft_radix: B[] not aligned on 64-byte boundary!");
 	bc = (struct complex *)b;
-	ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX);	ASSERT(HERE, (ptmp != 0x0), "FATAL: unable to allocate array A_ptmp in test_fft_radix.\n");
+	ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX);	ASSERT((ptmp != 0x0), "FATAL: unable to allocate array A_ptmp in test_fft_radix.\n");
 	arrtmp = ALIGN_DOUBLE(ptmp);	ptmp = 0x0;
-	ASSERT(HERE, ((long)((void *)arrtmp) & 63) == 0x0,"test_fft_radix: arrtmp[] not aligned on 64-byte boundary!");
+	ASSERT(((long)((void *)arrtmp) & 63) == 0x0,"test_fft_radix: arrtmp[] not aligned on 64-byte boundary!");
 	/* struct complex mat[radix][RADIX], *matp[RADIX]: */
-	ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX);	ASSERT(HERE, (ctmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n");
+	ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX);	ASSERT((ctmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n");
 	matp  = ALIGN_POINTER(ctmpp,struct complex*);
-	ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX);	ASSERT(HERE, (ctmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n");
+	ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX);	ASSERT((ctmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n");
 	mat   = ALIGN_POINTER(ctmpp,struct complex*);
 	for(i = 0; i < RADIX; ++i) {
-		ctmp = ALLOC_COMPLEX(ctmp, RADIX);	ASSERT(HERE, (ctmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n");
+		ctmp = ALLOC_COMPLEX(ctmp, RADIX);	ASSERT((ctmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n");
 		mat[i] = ALIGN_COMPLEX(ctmp);
 		ctmp = 0x0;	/* Must re-init pointer so the realloc used by the ALLOC macro allocates new fresh memory for each row */
 	}
   #ifdef USE_FGT61
-	iptr = ALLOC_UINT64(iptr, rmul*RADIX);	ASSERT(HERE, (iptr != 0x0), "FATAL: unable to allocate array AMOD in test_fft_radix.\n");
+	iptr = ALLOC_UINT64(iptr, rmul*RADIX);	ASSERT((iptr != 0x0), "FATAL: unable to allocate array AMOD in test_fft_radix.\n");
 	amod = ALIGN_UINT64(iptr);	iptr = 0x0;
 	am = (uint128 *)amod;
-	ASSERT(HERE, ((long)((void *)amod) & 63) == 0x0,"test_fft_radix: AMOD[] not aligned on 64-byte boundary!");
-	iptr = ALLOC_UINT64(iptr, rmul*RADIX);	ASSERT(HERE, (iptr != 0x0), "FATAL: unable to allocate array BMOD in test_fft_radix.\n");
+	ASSERT(((long)((void *)amod) & 63) == 0x0,"test_fft_radix: AMOD[] not aligned on 64-byte boundary!");
+	iptr = ALLOC_UINT64(iptr, rmul*RADIX);	ASSERT((iptr != 0x0), "FATAL: unable to allocate array BMOD in test_fft_radix.\n");
 	bmod = ALIGN_UINT64(iptr);	iptr = 0x0;
-	ASSERT(HERE, ((long)((void *)bmod) & 63) == 0x0,"test_fft_radix: BMOD[] not aligned on 64-byte boundary!");
+	ASSERT(((long)((void *)bmod) & 63) == 0x0,"test_fft_radix: BMOD[] not aligned on 64-byte boundary!");
 	bm = (uint128 *)bmod;
-	iptr = ALLOC_UINT64(iptr, rmul*RADIX);	ASSERT(HERE, (iptr != 0x0), "FATAL: unable to allocate array A_iptr in test_fft_radix.\n");
-	itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX);	ASSERT(HERE, (itmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n");
+	iptr = ALLOC_UINT64(iptr, rmul*RADIX);	ASSERT((iptr != 0x0), "FATAL: unable to allocate array A_iptr in test_fft_radix.\n");
+	itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX);	ASSERT((itmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n");
 	matmodp  = ALIGN_POINTER(itmpp,uint128*);
-	itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX);	ASSERT(HERE, (itmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n");
+	itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX);	ASSERT((itmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n");
 	matmod   = ALIGN_POINTER(itmpp,uint128*);
 	for(i = 0; i < RADIX; ++i) {
-		itmp = ALLOC_UINT128(itmp, RADIX);	ASSERT(HERE, (itmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n");
+		itmp = ALLOC_UINT128(itmp, RADIX);	ASSERT((itmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n");
 		matmod[i] = ALIGN_UINT128(itmp);
 		itmp = 0x0;	/* Must re-init pointer so the realloc used by the ALLOC macro allocates new fresh memory for each row */
 	}
@@ -371,8 +371,8 @@ void test_fft_radix(void)
 	/* Power-of-2 component of the DFT length: */
 	pow2 = 1 << trailz32(RADIX);
 	podd = RADIX >> trailz32(RADIX);
-	ASSERT(HERE, RADIX == pow2*podd, "Radix decomposition failed!");
-	ASSERT(HERE, (podd < 16 || podd == 31 || podd == 63), "test_fft_radix: Illegal radix; must be odd*2^n with odd = [3,5,7,9,11,13,15,31,63]");
+	ASSERT(RADIX == pow2*podd, "Radix decomposition failed!");
+	ASSERT((podd < 16 || podd == 31 || podd == 63), "test_fft_radix: Illegal radix; must be odd*2^n with odd = [3,5,7,9,11,13,15,31,63]");
 	/* These may not have been init'ed yet, so do it here: */
 	DAT_BITS = DAT_BITS_DEF;
 	PAD_BITS = PAD_BITS_DEF;
@@ -410,14 +410,14 @@ void test_fft_radix(void)
 #ifdef USE_FGT61
 	order = RADIX;	prim_root_q(order, &root_re,&root_im);	// RADIXth primitive root of unity
 	// primitive 16th root of unity, scaled by *8:
-	ASSERT(HERE, root_re == 1693317751237720973ull && root_im == 2283815672160731785ull,"Bad prim-root[16]!");;
+	ASSERT(root_re == 1693317751237720973ull && root_im == 2283815672160731785ull,"Bad prim-root[16]!");;
 #endif
 	for(i = 0; i < RADIX; i++)
 	{
 		theta = i*twopi/RADIX;
 	#ifdef USE_FGT61
 		pow_modq((uint64)i, root_re,root_im, &m0,&m1);	// m0,m1 = Ith power of prim-root
-		if(i == 0) ASSERT(HERE, m0 == 1ull && m1 == 0ull, "Bad 0th power of prim-root!");
+		if(i == 0) ASSERT(m0 == 1ull && m1 == 0ull, "Bad 0th power of prim-root!");
 		rm = 1ull;	im = 0ull;	// leftmost col has [m0,m1]^0 = [1,0]...
 	//	printf("DFT-int matrix row %d:\n",i);
 	#endif
@@ -429,7 +429,7 @@ void test_fft_radix(void)
 		#ifdef USE_FGT61
 			matmod[i][j].d0 = rm;
 			matmod[i][j].d1 = im;
-	//	printf("\t[%2d] = %20llu, %20llu\n",j, rm,im);
+	//	printf("\t[%2d] = %20" PRIu64 ", %20" PRIu64 "\n",j, rm,im);
 			cmul_modq(m0,m1, rm,im, &rm,&im);	// ... [j]col has [m0,m1]^j
 			rm = qreduce_full(rm);	im = qreduce_full(im);
 		#endif
@@ -443,8 +443,8 @@ void test_fft_radix(void)
 #ifdef USE_SSE2
 	/* In SSE2 mode re-Init data array, using [re,re,im,im] data layout: */
 	i = RADIX-1;	i = i + ( (i >> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
-	ASSERT(HERE, i == RADIX-1, "for large radix, need to enable array padding in indexing here!!");
-	ASSERT(HERE, rmul == 4, "!");
+	ASSERT(i == RADIX-1, "for large radix, need to enable array padding in indexing here!!");
+	ASSERT(rmul == 4, "!");
 	for(i = 0; i < RADIX ; i++)
 	{
 		a[ 2*i   *RE_IM_STRIDE  ] = ref[2*i  ];
@@ -719,7 +719,7 @@ void test_fft_radix(void)
 			l = (l - podd)%RADIX;	if(l < 0) { l += RADIX; }
 		}
 	} else {	// Odd-prime or odd-prime-power radix
-		ASSERT(HERE, (nradices == 1 && (radix_prim[0]&1))
+		ASSERT((nradices == 1 && (radix_prim[0]&1))
 				|| (nradices > 1 && (radix_prim[0] == radix_prim[1])), "Unexpected radix-decomposition!");
 	}
 
@@ -957,7 +957,7 @@ void test_fft_radix(void)
 	{
 		j = index[i];
 	#ifdef USE_SSE2
-		ASSERT(HERE, a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!");
+		ASSERT(a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!");
 	#endif
 		j1 =  2*i   *RE_IM_STRIDE;	// Real part
 		j2 = (2*i+1)*RE_IM_STRIDE;	// Imag part
@@ -971,10 +971,10 @@ void test_fft_radix(void)
 		// We only deploy FGT-based DFTs once the floating version has been tested, so no point doing the sorting
 		// here, just compare using the (presumably correct) output-index permutations derived for the float code:
 	#ifdef USE_FGT61
-		printf("I = %3u: DIF-ref: %20llu  %20llu,  FGT: %20llu  %20llu",i, bmod[2*j],bmod[2*j+1], amod[j1],amod[j2]);
+		printf("I = %3u: DIF-ref: %20" PRIu64 "  %20" PRIu64 ",  FGT: %20" PRIu64 "  %20" PRIu64,i, bmod[2*j],bmod[2*j+1], amod[j1],amod[j2]);
 		if(bmod[2*j] != amod[j1] || bmod[2*j+1] != amod[j2]) {
 			if(bmod[2*j] != qreduce_full(amod[j1]) || bmod[2*j+1] != qreduce_full(amod[j2])) {
-				printf("\tDiff = %20lld  %20lld\n",bmod[2*j]-amod[j1], bmod[2*j+1]-amod[j2]);
+				printf("\tDiff = %20" PRId64 "  %20" PRId64 "\n",bmod[2*j]-amod[j1], bmod[2*j+1]-amod[j2]);
 			} else {
 				printf("\tMatch (mod q)\n");
 			}
@@ -1058,7 +1058,7 @@ void test_fft_radix(void)
 	avgerr *= iradix;
 	printf("test_fft_radix: %d Mismatches detected in DIF DFT; maxerr = %15.10e, avgerr = %15.10e\n", nerr, maxerr, avgerr);
 	printf("\n");
-	ASSERT(HERE, nerr == 0, "test_fft_radix: Mismatches detected in DIF transform!");
+	ASSERT(nerr == 0, "test_fft_radix: Mismatches detected in DIF transform!");
 
 #endif
 
@@ -1082,12 +1082,12 @@ void test_fft_radix(void)
 /*printf("J = [%3d]: add %6d, %6d\n",j,(int)a[2*j  ],(int)a[2*j+1]);*/
 	}
 /*printf("sum[Re,Im] = %15.5f  %15.5f\n",t0,t2);*/
-	ASSERT(HERE, t0==t1 && t2==t3, "!");
+	ASSERT(t0==t1 && t2==t3, "!");
 	for(i = 0; i < rmul*RADIX ; i+=2)
 	{
 		a[i  ] = arrtmp[i  ];
 		a[i+1] = arrtmp[i+1];
-		ASSERT(HERE, a[i  ] == a[i+1], "!");
+		ASSERT(a[i  ] == a[i+1], "!");
 	}
 
   #else
@@ -1113,7 +1113,7 @@ void test_fft_radix(void)
 		// Since we negated Im-part above, must analogize to q - (pre-negation)a[j2] here:
 		amod[j1] =             a[j1];	// Here, the cast-to-uint64 is implied by the assignment ...
 		amod[j2] = q + (uint64)a[j2];	// ...but here need explicit cast to ensure integer addition.
-		printf("DIT-in[%2u]: float = [%10.5f,%10.5f]; int = [ %llu, q - %llu]\n",i, a[j1],a[j2] ,amod[j1],q - amod[j2]);
+		printf("DIT-in[%2u]: float = [%10.5f,%10.5f]; int = [ %" PRIu64 ", q - %" PRIu64 "]\n",i, a[j1],a[j2] ,amod[j1],q - amod[j2]);
 	#endif
 	}
 
@@ -1290,10 +1290,10 @@ void test_fft_radix(void)
 		// here, just compare using the (presumably correct) output-index permutations derived for the float code:
 	#ifdef USE_FGT61
 		// Flip sign on Im-part of ref-outputs:
-		printf("I = %3u: DIT-ref: %20llu  %20llu,  FGT: %20llu  %20llu",i, bmod[2*i],q-bmod[2*i+1], amod[j1],amod[j2]);
+		printf("I = %3u: DIT-ref: %20" PRIu64 "  %20" PRIu64 ",  FGT: %20" PRIu64 "  %20" PRIu64,i, bmod[2*i],q-bmod[2*i+1], amod[j1],amod[j2]);
 		if(bmod[2*i] != amod[j1] || q-bmod[2*i+1] != amod[j2]) {
 			if(bmod[2*i] != qreduce_full(amod[j1]) || q-bmod[2*i+1] != qreduce_full(amod[j2])) {
-				printf("\tDiff = %20lld  %20lld\n",bmod[2*i]-amod[j1], (q-bmod[2*i+1])-amod[j2]);
+				printf("\tDiff = %20" PRId64 "  %20" PRId64 "\n",bmod[2*i]-amod[j1], (q-bmod[2*i+1])-amod[j2]);
 			} else {
 				printf("\tMatch (mod q)\n");
 			}
@@ -1369,7 +1369,7 @@ void test_fft_radix(void)
 	avgerr *= iradix;
 	printf("test_fft_radix: %d Mismatches detected in DIT DFT; maxerr = %15.10e, avgerr = %15.10e\n", nerr, maxerr, avgerr);
 	printf("\n");
-	ASSERT(HERE, nerr == 0, "test_fft_radix: Mismatches detected in DIT transform!");
+	ASSERT(nerr == 0, "test_fft_radix: Mismatches detected in DIT transform!");
 
 #endif
 
@@ -1380,7 +1380,7 @@ void test_fft_radix(void)
 	for(i = 0; i < RADIX ; i++)
 	{
 	#ifdef USE_SSE2
-		ASSERT(HERE, a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!");
+		ASSERT(a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!");
 	#endif
 		j1 =  2*i   *RE_IM_STRIDE;
 		j2 = (2*i+1)*RE_IM_STRIDE;
@@ -1396,10 +1396,10 @@ void test_fft_radix(void)
 			printf("%4d  %25.15f  %25.15f, ERR= %15.10e\n",i,a[j1], a[j2], CABS(err_r, err_i));
 		}
 	#ifdef USE_FGT61
-		printf("I = %3u: DIF+DIT ref: [%lld,%lld],  FGT: [%20llu,%20llu]",i, (uint64)arrtmp[2*i],(uint64)arrtmp[2*i+1], amod[j1]/RADIX,amod[j2]/RADIX);
+		printf("I = %3u: DIF+DIT ref: [%" PRIu64 ",%" PRIu64 "],  FGT: [%20" PRIu64 ",%20" PRIu64 "]",i, (uint64)arrtmp[2*i],(uint64)arrtmp[2*i+1], amod[j1]/RADIX,amod[j2]/RADIX);	// all four args are uint64, so use the unsigned specifier throughout
 		if((uint64)arrtmp[2*i] != amod[j1]/RADIX || (uint64)arrtmp[2*i+1] != amod[j2]/RADIX) {
 			if((uint64)arrtmp[2*i] != qreduce_full(amod[j1])/RADIX || (uint64)arrtmp[2*i+1] != qreduce_full(amod[j2])/RADIX) {
-				printf("\tMismatch! mod-outputs (mod RADIX) = [%20llu,%20llu]\n",amod[j1]%RADIX, amod[j2]%RADIX);
+				printf("\tMismatch! mod-outputs (mod RADIX) = [%20" PRIu64 ",%20" PRIu64 "]\n",amod[j1]%RADIX, amod[j2]%RADIX);
 			} else {
 				printf("\tMatch (mod q)\n");
 			}
@@ -1411,7 +1411,7 @@ void test_fft_radix(void)
 	avgerr *= iradix;
 	printf("test_fft_radix: %d Mismatches detected in DIF/DIT combo; maxerr = %15.10e, avgerr = %15.10e\n", nerr, maxerr, avgerr);
 	printf("\n");
-	ASSERT(HERE, nerr == 0, "test_fft_radix: Mismatches detected in DIF/DIT combo!");
+	ASSERT(nerr == 0, "test_fft_radix: Mismatches detected in DIF/DIT combo!");
 
 #endif
 	printf("");
@@ -1481,7 +1481,7 @@ void matmul_fgtmod(uint128 **mat, uint128 vec_in[], uint128 vec_out[], int nrow,
 			cmul_modq(mat[i][j].d0,mat[i][j].d1, vec_in[j].d0,vec_in[j].d1, &rm,&im);
 			// CMUL_MODQ outputs in 0,4b - must feed to qreduce() prior to accumulating:
 			rm = qreduce(rm);	im = qreduce(im);
-	//	if(!i) printf("\t[%2d] = [%llu,%llu] * [%llu,%llu] = [%llu,%llu]\n",j, mat[i][j].d0,mat[i][j].d1, vec_in[j].d0,vec_in[j].d1, rm,im);
+	//	if(!i) printf("\t[%2d] = [%" PRIu64 ",%" PRIu64 "] * [%" PRIu64 ",%" PRIu64 "] = [%" PRIu64 ",%" PRIu64 "]\n",j, mat[i][j].d0,mat[i][j].d1, vec_in[j].d0,vec_in[j].d1, rm,im);
 			rm += vec_out[i].d0;
 			im += vec_out[i].d1;
 			// Normalize to ensure accumulated sum in [0,q-1]:
diff --git a/src/threadpool.c b/src/threadpool.c
index 94fef552..4dde1b9e 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -266,7 +266,7 @@ me at: heber.tomer@gmail.com
 	#endif
 	static void *worker_thr_routine(void *data)
 	{
-		char cbuf[STR_MAX_LEN];
+		char cbuf[STR_MAX_LEN*2];
 	#if INCLUDE_HWLOC
 		char str[80];
 	#endif
@@ -293,7 +293,7 @@ me at: heber.tomer@gmail.com
 		i = mi64_ith_set_bit(CORE_SET, i+1, MAX_CORES>>6);	// Remember, [i]th-bit index in arglist is *unit* offset, i.e. must be in [1,MAX_CORES]
 		if(i < 0) {
 			fprintf(stderr,"Affinity CORE_SET does not have a [%u]th set bit!",my_id % pool->num_of_cores);
-			ASSERT(HERE, 0, "Aborting.");
+			ASSERT(0, "Aborting.");
 		}
 		CPU_SET(i, &cpu_set);
 		errcode = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof (cpu_set), &cpu_set);
@@ -338,7 +338,7 @@ me at: heber.tomer@gmail.com
 		i = mi64_ith_set_bit(CORE_SET, i+1, MAX_CORES>>6);	// Remember, [i]th-bit index in arglist is *unit* offset, i.e. must be in [1,MAX_CORES]
 		if(i < 0) {
 			fprintf(stderr,"Affinity CORE_SET does not have a [%u]th set bit!",my_id % pool->num_of_cores);
-			ASSERT(HERE, 0, "Aborting.");
+			ASSERT(0, "Aborting.");
 		}
 
 	 #if INCLUDE_HWLOC
@@ -349,14 +349,14 @@ me at: heber.tomer@gmail.com
 		if (obj) {
 			hwloc_bitmap_or(cpuset, cpuset, obj->cpuset);
 		} else {
-			snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: HWLOC_OBJ_PU[%u] not found.\n",i);
+			snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: HWLOC_OBJ_PU[%u] not found.\n",i);
 			fprintf(stderr,"%s",cbuf);
 		}
 		// Set affinity to specified logical CPUs:
 		if (hwloc_set_cpubind(hw_topology, cpuset, HWLOC_CPUBIND_THREAD)) {
 			int error = errno;
 			hwloc_bitmap_snprintf (str, sizeof (str), cpuset);
-			snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Warning: Unable to set affinity to cpuset %s: %s; leaving up to OS to manage thread/core binding.\n",str,strerror(error));
+			snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Warning: Unable to set affinity to cpuset %s: %s; leaving up to OS to manage thread/core binding.\n",str,strerror(error));
 			fprintf(stderr,"%s",cbuf);
 	  #if THREAD_POOL_DEBUG
 		} else {
@@ -401,7 +401,7 @@ me at: heber.tomer@gmail.com
 		i = mi64_ith_set_bit(CORE_SET, i+1, MAX_CORES>>6);	// Remember, [i]th-bit index in arglist is *unit* offset, i.e. must be in [1,MAX_CORES]
 		if(i < 0) {
 			fprintf(stderr,"Affinity CORE_SET does not have a [%u]th set bit!",my_id % pool->num_of_cores);
-			ASSERT(HERE, 0, "Aborting.");
+			ASSERT(0, "Aborting.");
 		}
 		apolicy.affinity_tag = i; // set affinity tag
 	  #if THREAD_POOL_DEBUG
diff --git a/src/twopmodq.c b/src/twopmodq.c
index e4b3ae09..4c8f4349 100755
--- a/src/twopmodq.c
+++ b/src/twopmodq.c
@@ -46,7 +46,7 @@ uint64 test_modsqr64(uint64 x, uint64 q)
 	uint64 qinv,t,hi,lo;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q & 0x1, "q must be odd!");
+	ASSERT(q & 0x1, "q must be odd!");
 	qinv = (q+q+q) ^ (uint64)2;
 	for(j = 0; j < 4; j++)
 	{
@@ -92,7 +92,7 @@ uint96 test_modsqr96(uint96 x, uint96 q)
 #endif
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q.d0 & 0x1, "q must be odd!");
+	ASSERT(q.d0 & 0x1, "q must be odd!");
 	/* Init qinv = q. Since we're only interested in the bottom 3 bits of q, can use 64-bit math for that:*/
 	qinv.d0 = q.d0;	qinv.d1 = (uint64)0;
 
@@ -134,7 +134,7 @@ uint128 test_modsqr128(uint128 x, uint128 q)
 	uint128 qinv,t,hi,lo;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q.d0 & 0x1, "q must be odd!");
+	ASSERT(q.d0 & 0x1, "q must be odd!");
 	/* Init qinv = q. Since we're only interested in the bottom 3 bits of q, can use 64-bit math for that:*/
 	qinv.d0 = (q.d0+q.d0+q.d0) ^ (uint64)2;
 	qinv.d1 = (uint64)0;
@@ -178,7 +178,7 @@ uint128 test_modsqr128_96(uint128 x, uint128 q)
 	uint128 qinv,t,lo;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q.d0 & 0x1, "q must be odd!");
+	ASSERT(q.d0 & 0x1, "q must be odd!");
 	/* Init qinv = q. Since we're only interested in the bottom 3 bits of q, can use 64-bit math for that:*/
 	qinv.d0 = (q.d0+q.d0+q.d0) ^ (uint64)2;
 	qinv.d1 = (uint64)0;
@@ -233,7 +233,7 @@ uint32 test_twopmodq64(uint32 imax)
 	  #endif
 		mi64_div(prod128, &q, 2,1, 0x0,&rem);	// Omit quotient computation; remainder in rem
 		if(rem != 1) {
-			fprintf(stderr,"Mismatch in test_twopmodq64: p = %llu; q = %llu: 2^[+|-p] (mod q) = %llu, %llu.\n",p,q,pos,neg);
+			fprintf(stderr,"Mismatch in test_twopmodq64: p = %" PRIu64 "; q = %" PRIu64 ": 2^[+|-p] (mod q) = %" PRIu64 ", %" PRIu64 ".\n",p,q,pos,neg);
 			return 1;
 		}
 	}
@@ -386,7 +386,7 @@ uint64 twopmodq63(uint64 p, uint64 q)
 
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q & 0x1, "q must be odd!");
+	ASSERT(q & 0x1, "q must be odd!");
 
 	qinv = (q+q+q) ^ (uint64)2;
 	for(j = 0; j < 4; j++)
@@ -408,7 +408,7 @@ uint64 twopmodq63(uint64 p, uint64 q)
 	{
 		x = x + x - (q & -(x >= qhalf));
 	}
-	DBG_ASSERT(HERE, x < q, "twopmodq63 : x0 < q");
+	DBG_ASSERT(x < q, "twopmodq63 : x0 < q");
 
 #if FAC_DEBUG
 	fprintf(stderr, "twopmodq63 : x0 = %s, q = %s\n", &str0[convert_uint64_base10_char(str0, x)], &str1[convert_uint64_base10_char(str1, q)] );
@@ -454,7 +454,7 @@ uint64 twopmodq63(uint64 p, uint64 q)
 uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 {
 //int dbg = ( (p == (1ull<<32)) && ( (k0 == 2958ull) || (k1 == 2958ull) || (k2 == 2958ull) || (k3 == 2958ull) ) );
-//if(dbg) printf("Hit! k0-3 = %llu, %llu, %llu, %llu\n",k0, k1, k2, k3);
+//if(dbg) printf("Hit! k0-3 = %" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "\n",k0, k1, k2, k3);
 	 int32 j;
 	uint32 q32_0, q32_1, q32_2, q32_3;
 	uint32 qinv32_0, qinv32_1, qinv32_2, qinv32_3;
@@ -483,7 +483,7 @@ uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 	qhalf3 = q3>>1;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
 
 	// This gives 4-bit inverse:
 	q32_0 = (uint32)q0;
@@ -552,7 +552,7 @@ uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 		x3 = x3 + x3 - (q3 & -(x3 >= qhalf3));
 	}
 
-//if(dbg) printf("q1 = %llu: x1 = %llu\n",q1,x1);
+//if(dbg) printf("q1 = %" PRIu64 ": x1 = %" PRIu64 "\n",q1,x1);
 	for(j = start_index-2; j >= 0; j--)
 	{
 	/*...x^2 mod q is returned in x. On MIPS, we discard the lower half of DMULTU(q,x*y*qinv).	*/
@@ -618,14 +618,14 @@ uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 			if(x2 >= q2) x2 -= q2;
 			if(x3 >= q3) x3 -= q3;
 		#endif
-//if(dbg) printf("2*x^2 = %llu\n",x1);
+//if(dbg) printf("2*x^2 = %" PRIu64 "\n",x1);
  		} else {
-//if(dbg) printf("  x^2 = %llu\n",x1);
+//if(dbg) printf("  x^2 = %" PRIu64 "\n",x1);
 		}
 	}
 
 	/*...Double and return.	These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. */
-//if(dbg) printf("xout = %llu\n",x1+x1-q1+FERMAT);
+//if(dbg) printf("xout = %" PRIu64 "\n",x1+x1-q1+FERMAT);
 	r = 0;
 	if(x0+x0-q0+FERMAT == 1)r += 1;
 	if(x1+x1-q1+FERMAT == 1)r += 2;
@@ -659,7 +659,7 @@ uint64 twopmodq63_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
 
 	// This gives 4-bit inverse:
 	q32_0 = (uint32)q0;
@@ -927,7 +927,7 @@ uint64 twopmodq63_x8(uint64 q0, uint64 q1, uint64 q2, uint64 q3, uint64 q4, uint
 	uint64 lead6, pshift6, qinv6, zshift6, x6, lo6, hi6;
 	uint64 lead7, pshift7, qinv7, zshift7, x7, lo7, hi7;
 
-	DBG_ASSERT(HERE, (q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq63_x8: Inputs nonmonotone!");
+	DBG_ASSERT((q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq63_x8: Inputs nonmonotone!");
 
 	pshift0 = q0 + 63;
 	pshift1 = q1 + 63;
@@ -977,7 +977,7 @@ uint64 twopmodq63_x8(uint64 q0, uint64 q1, uint64 q2, uint64 q3, uint64 q4, uint
 	zshift7 = 63 - lead7;	zshift7 <<= 1;	pshift7 = ~pshift7;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
 
 	qinv0 = (q0+q0+q0) ^ (uint64)2;
 	qinv1 = (q1+q1+q1) ^ (uint64)2;
@@ -1195,7 +1195,7 @@ uint64 twopmmodq64(uint64 p, uint64 q)
 //	debug = (q == 640126220763136ull);	/* Uncomment (debug = ...) part and customize q to enable debug-printing */
 	uint32 curr_bit, leadb, start_index, nshift;
 	uint64 pshift, qhalf, qinv, x, rsqr;
-	if(debug) printf("twopmmodq64: computing 2^%llu (mod %llu)\n",p,q);
+	if(debug) printf("twopmmodq64: computing 2^%" PRIu64 " (mod %" PRIu64 ")\n",p,q);
 	// If p <= 64, directly compute 2^p (mod q):
 	if(p < 64)
 		return (1ull < p) % q;
@@ -1209,7 +1209,7 @@ uint64 twopmmodq64(uint64 p, uint64 q)
 	if(nshift) {
 		// p >= nshift guaranteed here:
 		q >>= nshift; p -= nshift;	// Right-shift dividend by (nshift) bits; for 2^p this means subtracting nshift from p
-		if(debug) printf("Removed power-of-2 from q: q' = (q >> %u) = %llu\n",nshift,q);
+		if(debug) printf("Removed power-of-2 from q: q' = (q >> %u) = %" PRIu64 "\n",nshift,q);
 	}
 	qhalf  = q>>1;	/* = (q-1)/2, since q odd. */
 	// Extract leftmost 7 bits of (p - 64); if > 64, use leftmost 6 instead:
@@ -1222,7 +1222,7 @@ uint64 twopmmodq64(uint64 p, uint64 q)
 		start_index = 57-j;
 	}
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, (q & 0x1) && (q > 1), "q must be odd > 1!");
+	ASSERT((q & 0x1) && (q > 1), "q must be odd > 1!");
 	qinv = (q+q+q) ^ (uint64)2;
 	for(j = 0; j < 4; j++)
 		qinv = qinv*((uint64)2 - q*qinv);
@@ -1237,16 +1237,16 @@ uint64 twopmmodq64(uint64 p, uint64 q)
 		MONT_MUL64(x,rsqr, q,qinv, x);	// x*R (mod q) = MONT_MUL(x,R^2 (mod q),q,qinv)
  	}
 	if(debug) {
-		printf("leadb = %u, x0 = %llu\n",leadb,x);
-		printf("pshift = p - %u = %llu\n",64,pshift);
+		printf("leadb = %u, x0 = %" PRIu64 "\n",leadb,x);
+		printf("pshift = p - %u = %" PRIu64 "\n",64,pshift);
 		pow = leadb + 64;
-		printf("twopmmodq64: Initial power = 2^(%u+64) = 2^%u mod q' = %llu\n",leadb,pow,x);
+		printf("twopmmodq64: Initial power = 2^(%u+64) = 2^%u mod q' = %" PRIu64 "\n",leadb,pow,x);
 		printf("twopmmodq64: Looping over %u remaining bits in power:\n",start_index);
 	}
 	for(j = start_index-1; j >= 0; j--) {
 		curr_bit = (pshift >> j) & (uint64)1;
 		MONT_SQR64(x,q,qinv,x);
-		if(debug) { pow = 2*pow + curr_bit - 64; printf("\tJ = %2u: [bit = %u]pow = %u, x = %llu\n",j,curr_bit,pow,x); }
+		if(debug) { pow = 2*pow + curr_bit - 64; printf("\tJ = %2u: [bit = %u]pow = %u, x = %" PRIu64 "\n",j,curr_bit,pow,x); }
 		if(curr_bit) {
 			if(x > qhalf) {	/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 				x = x + x;	x -= q;
@@ -1257,12 +1257,12 @@ uint64 twopmmodq64(uint64 p, uint64 q)
 	}
 	// May 2022: Since pre-subtracted 64 from computed powermod exponent, no need to un-scale the loop output anymore:
 	// MONT_UNITY_MUL64(x,q,qinv,x);
-	if(debug) printf("pow = %u, x = %llu\n",pow,x);
+	if(debug) printf("pow = %u, x = %" PRIu64 "\n",pow,x);
 	// If we applied an initial right-justify shift to the modulus, restore the shift to the
 	// current (partial) remainder and re-add the off-shifted part of the true remainder.
 	if(nshift) {
 		x = (x << nshift);// + rem_save;
-		if(debug) printf("Restoring power-of-2: pow = %u, x *= 2^%u = %llu\n",pow+nshift,nshift,x);
+		if(debug) printf("Restoring power-of-2: pow = %u, x *= 2^%u = %" PRIu64 "\n",pow+nshift,nshift,x);
 	}
 	return x;
 }
@@ -1337,8 +1337,8 @@ void twopmmodq64_q4(uint64 p, uint64 *i0, uint64 *i1, uint64 *i2, uint64 *i3, ui
 	start_index = 58-j;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!");
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
+	ASSERT(q0 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!");	// check all four moduli (q0 was previously skipped, q1 tested twice)
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
 
 	qinv0 = (q0+q0+q0) ^ (uint64)2;
 	qinv1 = (q1+q1+q1) ^ (uint64)2;
@@ -1572,7 +1572,7 @@ uint64 twopmodq64(uint64 p, uint64 q)
 	qhalf  = q>>1;	/* = (q-1)/2, since q odd. */
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q & 1, "q must be odd!");
+	ASSERT(q & 1, "q must be odd!");
 	qinv = (q+q+q) ^ (uint64)2;
 	for(j = 0; j < 4; j++)
 	{
@@ -1595,7 +1595,7 @@ uint64 twopmodq64(uint64 p, uint64 q)
 		x = x + x - (q & -(x >= qhalf));
 	}
 #if FAC_DEBUG
-/*	ASSERT(HERE, x < q, "twopmodq64: x0 < q");	*/
+/*	ASSERT(x < q, "twopmodq64: x0 < q");	*/
   #if 0	/* These appear to be benign: */
 	if(x >= q){ sprintf(char_buf, "twopmodq64: (x0 = %s) >= (q = %s)", &str0[convert_uint64_base10_char(str0, x)], &str1[convert_uint64_base10_char(str1, q)] );	DBG_WARN(HERE, char_buf, STATFILE, !restart); }
   #endif
@@ -1661,7 +1661,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
 
 	// This gives 4-bit inverse:
 	q32_0 = (uint32)q0;
@@ -1834,7 +1834,7 @@ uint64 twopmodq64_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
 
 	// This gives 4-bit inverse:
 	q32_0 = (uint32)q0;
@@ -2101,11 +2101,11 @@ uint64 twopmodq65(uint64 p, uint64 k)
 if(dbg)printf("twopmodq65:\n");
 #endif
 	// Assume q is 65-bits here, so check that during construction of q = 2.k.p+1:
-	q = k*p;	ASSERT(HERE, q+q < q, "q not 65 bits!");
+	q = k*p;	ASSERT(q+q < q, "q not 65 bits!");
 	q = (q << 1) + 1;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q & 1, "q must be odd!");
+	ASSERT(q & 1, "q must be odd!");
 	qinv = (q+q+q) ^ (uint64)2;
 	for(j = 0; j < 4; j++)
 	{
@@ -2147,7 +2147,7 @@ if(dbg)printf("twopmodq65:\n");
 	!	Get SQR_HIGH_EXACT(x) - MULH_EXACT(q, lo*qinv), normalizing as we go.
 	*/
 	#if FAC_DEBUG
-		if(dbg) printf("twopmodq65: while(%llu++ < %llu) || (%llu+=%llu < %llu)\n",A,B,hi,q,y);
+		if(dbg) printf("twopmodq65: while(%" PRIu64 "++ < %" PRIu64 ") || (%" PRIu64 "+=%" PRIu64 " < %" PRIu64 ")\n",A,B,hi,q,y);
 	#endif
 		while(A < B || (A == B && hi < y))	/* SQR_HIGH_EXACT(x) < MULH_EXACT(q, lo*qinv); add q until >= . */
 		{
@@ -2157,7 +2157,7 @@ if(dbg)printf("twopmodq65:\n");
 
 	/* Do the subtraction. Result is in (hi,A). */
 	#if FAC_DEBUG
-		/*ASSERT(HERE, A > B || (A == B && hi >= y), "twopmodq65 : A > B || (A == B && hi >= y)"); */
+		/*ASSERT(A > B || (A == B && hi >= y), "twopmodq65 : A > B || (A == B && hi >= y)"); */
 	#endif
 		A -= B; x = hi; hi -= y;
 		A -= (hi > x);	/* had a borrow */
@@ -2165,25 +2165,25 @@ if(dbg)printf("twopmodq65:\n");
 	/* ...and normalize. Result is in (x, A).. */
 		x = hi;
 	#if FAC_DEBUG
-		if(dbg) printf("twopmodq65: while(A=%llu-- > 1) || (%llu-=%llu >=%llu)\n",A,x,q,q);
+		if(dbg) printf("twopmodq65: while(A=%" PRIu64 "-- > 1) || (%" PRIu64 "-=%" PRIu64 " >=%" PRIu64 ")\n",A,x,q,q);
 	#endif
 		while(A > 1 || (A == 1 && x >= q)) {
 			--A; x -= q;
 		#if FAC_DEBUG
-			if(dbg) printf("twopmodq65: A = %llu, x-q = %llu, q = %llu, hi = %llu\n",A,x,q,hi);
-			if(dbg) printf("twopmodq65: (x > hi) = %llu\n",(x > hi));
-			if(dbg) printf("twopmodq65: (x <=hi) = %llu\n",(x <=hi));
-			if(dbg) printf("twopmodq65: (hi < x) = %llu\n",(hi < x));
-			if(dbg) printf("twopmodq65: (hi <=x) = %llu\n",(hi <=x));
-			if(dbg) printf("twopmodq65: (hi >=x) = %llu\n",(hi >=x));
-			if(dbg) printf("twopmodq65: (hi -x ) = %llu - %llu = %llu\n",hi,x,(hi - x));
+			if(dbg) printf("twopmodq65: A = %" PRIu64 ", x-q = %" PRIu64 ", q = %" PRIu64 ", hi = %" PRIu64 "\n",A,x,q,hi);
+			if(dbg) printf("twopmodq65: (x > hi) = %" PRIu64 "\n",(uint64)(x > hi));	// comparison yields int; cast to match PRIu64
+			if(dbg) printf("twopmodq65: (x <=hi) = %" PRIu64 "\n",(uint64)(x <=hi));
+			if(dbg) printf("twopmodq65: (hi < x) = %" PRIu64 "\n",(uint64)(hi < x));
+			if(dbg) printf("twopmodq65: (hi <=x) = %" PRIu64 "\n",(uint64)(hi <=x));
+			if(dbg) printf("twopmodq65: (hi >=x) = %" PRIu64 "\n",(uint64)(hi >=x));
+			if(dbg) printf("twopmodq65: (hi -x ) = %" PRIu64 " - %" PRIu64 " = %" PRIu64 "\n",hi,x,(hi - x));
 			if(dbg) printf("twopmodq65: (hi.-x.) = %lf  - %lf  = %lf \n",(double)hi,(double)x,((double)hi - (double)x));
 		#endif
 			/* had a borrow: */
 			A -= (x > hi);
 		#if FAC_DEBUG
-			ASSERT(HERE, ((double)x > (double)hi) == (x > hi),"((double)x > (double)hi) == (x > hi)");
-			ASSERT(HERE, (int64)A >=0,"(int64)A >=0");
+			ASSERT(((double)x > (double)hi) == (x > hi),"((double)x > (double)hi) == (x > hi)");
+			ASSERT((int64)A >=0,"(int64)A >=0");
 		#endif
 		}
 
@@ -2195,7 +2195,7 @@ if(dbg)printf("twopmodq65:\n");
 
 			/* ...and normalize the result. */
 		#if FAC_DEBUG
-			if(dbg) printf("twopmodq65: while(B=%llu-- > 1) || (%llu-=%llu >=%llu)\n",B,y,q,q);
+			if(dbg) printf("twopmodq65: while(B=%" PRIu64 "-- > 1) || (%" PRIu64 "-=%" PRIu64 " >=%" PRIu64 ")\n",B,y,q,q);
 		#endif
 			while(B > 1 || (B == 1 && y >= q))
 			{
@@ -2214,14 +2214,14 @@ if(dbg)printf("twopmodq65:\n");
 
 	/* ...and normalize the result. */
 #if FAC_DEBUG
-	if(dbg) printf("twopmodq65:#while(%llu-- > 1) || (%llu-=%llu >=%llu)\n",B,y,q,q);
+	if(dbg) printf("twopmodq65:#while(%" PRIu64 "-- > 1) || (%" PRIu64 "-=%" PRIu64 " >=%" PRIu64 ")\n",B,y,q,q);
 #endif
 	while(B > 1 || (B == 1 && y >= q)) {
 		--B; t = y; y -= q;
 		B -= (y > t);		/* had a borrow */
 	}
 
-	/*if(y == (uint64)1) ASSERT(HERE, B == 0, "twopmodq65 : B == 0");*/
+	/*if(y == (uint64)1) ASSERT(B == 0, "twopmodq65 : B == 0");*/
 	return (y + FERMAT == 1ull && B == 0ull);
 }
 
@@ -2250,7 +2250,7 @@ uint64 twopmodq65_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
 
 	qinv0 = (q0+q0+q0) ^ (uint64)2;
 	qinv1 = (q1+q1+q1) ^ (uint64)2;
@@ -2401,7 +2401,7 @@ uint64 twopmodq65_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
 
 	qinv0 = (q0+q0+q0) ^ (uint64)2;
 	qinv1 = (q1+q1+q1) ^ (uint64)2;
diff --git a/src/twopmodq100.c b/src/twopmodq100.c
index e9dc1734..cc2eb64f 100755
--- a/src/twopmodq100.c
+++ b/src/twopmodq100.c
@@ -54,7 +54,7 @@
 	  #endif
 		uint64 *fq0[32],*fq1[32], *fqinv0[32],*fqinv1[32], *fx0[32],*fx1[32];
 		for(j = 0; j < 32; j++) {
-			ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!");
+			ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!");
 		}
 	#else
 		const double crnd = 3.0*0x4000000*0x2000000;	// Const used to emulate DNINT(x) and (when multiplied by BASE) 2^50 * DNINT(x*2^bpow2)
@@ -73,12 +73,12 @@
 		double *fq0[32],*fq1[32], *fqinv0[32],*fqinv1[32], *fx0[32],*fx1[32], kdbl[32];
 		// AVX-512 Foundation lacks the needed DQ extensions, so use HLL to convert kvec entries to double:
 		for(j = 0; j < 32; j++) {
-			ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!");
+			ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!");
 			kdbl[j] = (double)k[j];
 		}
-		ASSERT(HERE, base  == (double)(1ull<<bpow2) && base <= TWO48FLOAT && base *binv  == 1.0, "Current version only supports bpow2 <= 48; base*binv must == 1.0!");
-		ASSERT(HERE, base0 == (double)(1ull<< pow0)                       && base0*binv0 == 1.0, "base0,binv0 check fails!");
-		ASSERT(HERE, base1 == (double)(1ull<< pow1)                       && base1*binv1 == 1.0, "base1,binv1 check fails!");
+		ASSERT(base  == (double)(1ull<<bpow2) && base <= TWO48FLOAT && base *binv  == 1.0, "Current version only supports bpow2 <= 48; base*binv must == 1.0!");
+		ASSERT(base0 == (double)(1ull<< pow0)                       && base0*binv0 == 1.0, "base0,binv0 check fails!");
+		ASSERT(base1 == (double)(1ull<< pow1)                       && base1*binv1 == 1.0, "base1,binv1 check fails!");
 	#endif
 		uint32 mul_width = bpow2<<1;
 		if(p != psave) {
@@ -96,7 +96,7 @@
 			}
 			// lead7 in [bpow2, 2*bpow2):
 			zshift = mul_width-1 - lead7;	// zshift in [0,bpow2-1]
-			ASSERT(HERE, zshift < bpow2, "zshift out of expected range!");
+			ASSERT(zshift < bpow2, "zshift out of expected range!");
 			pshift = ~pshift;
 		}
 
@@ -115,10 +115,10 @@
 			#endif
 				fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 			#ifndef COMPILER_TYPE_GCC
-				ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+				ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 			#endif
-				ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-				ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+				ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+				ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			}
 			if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 				free((void *)sc_arr);	sc_arr=0x0;
@@ -126,9 +126,9 @@
 			// Alloc the local-memory block the #bytes multiplier has plenty of extra room built in, e.g. for debug-data-writes:
 		#ifdef USE_AVX512_I
 
-			sc_arr = ALLOC_UINT64(sc_arr, 0x140*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_UINT64(sc_arr, 0x140*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = (uint64 *)ALIGN_VEC_U64(sc_arr);	// Force vec_u64-alignment
-			ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");	// uintptr_t avoids truncating the pointer; matches the floating-point-FMA branch
 		  #ifdef MULTITHREAD
 			__r0  = sc_ptr;
 		  #else
@@ -157,9 +157,9 @@
 		#else	// Default AVX-512 floating-point-FMA mode
 		/***************************************************/
 
-			sc_arr = ALLOC_DOUBLE(sc_arr, 0x140*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_DOUBLE(sc_arr, 0x140*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = (uint64 *)ALIGN_VEC_DBL(sc_arr);	// Force vec_u64-alignment
-			ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		  #ifdef MULTITHREAD
 			__r0  = sc_ptr;
 		  #else
@@ -190,7 +190,7 @@
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
 
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		sc_ptr = __r0 + thr_id*0x140;
 
 	  #ifdef USE_AVX512_I
@@ -237,7 +237,7 @@
 	#ifdef MUL_LOHI64_SUBROUTINE
 		#error MUL_LOHI64_SUBROUTINE defined!
 	#endif
-		ASSERT(HERE, (p >> 63) == 0, "twopmodq100_q32: p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "twopmodq100_q32: p must be < 2^63!");
 
 	#ifdef USE_AVX512_I
 
@@ -416,7 +416,7 @@
 
 	#endif
 		// Init the modpow residue:
-	ASSERT(HERE, zshift < 48, "zshift out of expected range!");
+	ASSERT(zshift < 48, "zshift out of expected range!");
 		dtmp = 1ull<<zshift;	VEC_DBL_INIT_8((vec_dbl*)fx0[0], dtmp);	VEC_DBL_INIT_8((vec_dbl*)fx0[8], dtmp);	VEC_DBL_INIT_8((vec_dbl*)fx0[16], dtmp);	VEC_DBL_INIT_8((vec_dbl*)fx0[24], dtmp);
 		dtmp = 0ull;			VEC_DBL_INIT_8((vec_dbl*)fx1[0], dtmp);	VEC_DBL_INIT_8((vec_dbl*)fx1[8], dtmp);	VEC_DBL_INIT_8((vec_dbl*)fx1[16], dtmp);	VEC_DBL_INIT_8((vec_dbl*)fx1[24], dtmp);
 /* Debug:
diff --git a/src/twopmodq128.c b/src/twopmodq128.c
index beacf1bf..b42ab2ae 100755
--- a/src/twopmodq128.c
+++ b/src/twopmodq128.c
@@ -106,7 +106,7 @@ uint128 twopmmodq128(uint128 p, uint128 q)
 	}
 #endif
 	// Find inverse (mod 2^128) of q; q must be odd for Montgomery-style modmul to work:
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmmodq128 : q must be odd for Montgomery-style modmul!");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmmodq128 : q must be odd for Montgomery-style modmul!");
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = (uint64)0;
 	/* Compute qinv  = q^-1 (mod R = 2^128) via Newton iteration qinv = qinv*(2 - q*qinv), starting with
@@ -268,7 +268,7 @@ if(dbg) printf("twopmodq128:\n");
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq128 : q must be odd for Montgomery-style modmul!");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq128 : q must be odd for Montgomery-style modmul!");
 #endif
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = (uint64)0;
@@ -307,7 +307,7 @@ if(dbg) printf("twopmodq128:\n");
 #endif
 
 #if FAC_DEBUG
-	ASSERT(HERE, qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
+	ASSERT(qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
 	if(dbg) printf("q    = %s\n", &char_buf[convert_uint128_base10_char(char_buf, q   )]);
 	if(dbg) printf("qinv = %s\n", &char_buf[convert_uint128_base10_char(char_buf, qinv)]);
 #endif
@@ -337,7 +337,7 @@ if(dbg) printf("twopmodq128:\n");
 	if(TEST_BIT128(pshift, j))
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT128(x, q), "twopmodq128 : CMPULT128(x,q)");
+		ASSERT(CMPULT128(x, q), "twopmodq128 : CMPULT128(x,q)");
 	#endif
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(CMPUGT128(x, qhalf)){ ADD128(x, x, x); SUB128(x, q, x); }else{ ADD128(x, x, x); }
@@ -375,7 +375,7 @@ if(dbg) printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_b
 		if(TEST_BIT128(pshift, j))
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT128(x, q), "twopmodq128 : CMPULT128(x,q)");
+			ASSERT(CMPULT128(x, q), "twopmodq128 : CMPULT128(x,q)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT128(x, qhalf)){ ADD128(x, x, x); SUB128(x, q, x); }else{ ADD128(x, x, x); }
@@ -408,7 +408,7 @@ if(dbg) printf("x0 = %s\n", &char_buf[convert_uint128_base10_char(char_buf, x)])
 */
 uint64 twopmodq128x2(uint64 *p_in, uint64 k)
 {
-	ASSERT(HERE, p_in != 0x0, "Null p_in pointer!");
+	ASSERT(p_in != 0x0, "Null p_in pointer!");
 #if FAC_DEBUG
 	int dbg = STREQ(&char_buf[convert_mi64_base10_char(char_buf, p_in, 2, 0)], "0");
 #endif
@@ -432,7 +432,7 @@ if(dbg) printf("twopmodq128x2:\n");
 
 	// Use x as tmp to hold 2*p:
 	ADD128(p,p, x);
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x, k, (uint64 *)&q, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x, k, (uint64 *)&q, 2), "q must be < 2^128!");
 	q.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 
 	RSHIFT_FAST128(q, 1, qhalf);	/* = (q-1)/2, since q odd. */
@@ -481,7 +481,7 @@ if(dbg) printf("twopmodq128x2:\n");
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq128x2 : q must be odd for Montgomery-style modmul!");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq128x2 : q must be odd for Montgomery-style modmul!");
 #endif
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = (uint64)0;
@@ -520,7 +520,7 @@ if(dbg) printf("twopmodq128x2:\n");
 #endif
 
 #if FAC_DEBUG
-	ASSERT(HERE, qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128x2 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
+	ASSERT(qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128x2 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
 	if(dbg) printf("q    = %s\n", &char_buf[convert_uint128_base10_char(char_buf, q   )]);
 	if(dbg) printf("qinv = %s\n", &char_buf[convert_uint128_base10_char(char_buf, qinv)]);
 #endif
@@ -553,7 +553,7 @@ if(dbg) printf("twopmodq128x2:\n");
 	if(TEST_BIT128(pshift, j))
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT128(x,q), "twopmodq128x2 : CMPULT128(x,q)");
+		ASSERT(CMPULT128(x,q), "twopmodq128x2 : CMPULT128(x,q)");
 	#endif
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(CMPUGT128(x, qhalf)){ ADD128(x, x, x); SUB128(x, q, x); }else{ ADD128(x, x, x); }
@@ -590,7 +590,7 @@ if(dbg) printf("twopmodq128x2:\n");
 		if(TEST_BIT128(pshift, j))
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT128(x,q), "twopmodq128x2 : CMPULT128(x,q)");
+			ASSERT(CMPULT128(x,q), "twopmodq128x2 : CMPULT128(x,q)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT128(x, qhalf)){ ADD128(x, x, x); SUB128(x, q, x); }else{ ADD128(x, x, x); }
@@ -622,7 +622,7 @@ if(dbg) printf("twopmodq128x2:\n");
 // Second version of above, which takes factor candidate q in uint128 form:
 uint64 twopmodq128x2B(uint64 *p_in, uint128 q)
 {
-	ASSERT(HERE, p_in != 0x0, "Null p_in pointer!");
+	ASSERT(p_in != 0x0, "Null p_in pointer!");
 	 int32 j;	/* This needs to be signed because of the LR binary exponentiation. */
 	uint64 lo64;
 	uint128 p, qhalf, qinv, x, lo, hi;
@@ -659,7 +659,7 @@ uint64 twopmodq128x2B(uint64 *p_in, uint128 q)
 	}
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq128x2B: q must be odd for Montgomery-style modmul!");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq128x2B: q must be odd for Montgomery-style modmul!");
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = (uint64)0;
 	for(j = 0; j < 4; j++) {
@@ -725,7 +725,7 @@ uint64 twopmodq128x2B(uint64 *p_in, uint128 q)
 /*** 4-trial-factor version ***/
 uint64 twopmodq128_q4(uint64* p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 {
-	ASSERT(HERE, p_in != 0x0, "Null p_in pointer!");
+	ASSERT(p_in != 0x0, "Null p_in pointer!");
 	 int32 j;
 	uint64 lo64_0, lo64_1, lo64_2, lo64_3, lead7, r;
 	uint128 p, q0, q1, q2, q3
@@ -747,10 +747,10 @@ uint64 twopmodq128_q4(uint64* p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 
 	// Use x0 as tmp to hold 2*p:
 	ADD128(p,p, x0);
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 2), "q must be < 2^128!");
 
 	q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 	q1.d0 += 1;
@@ -967,7 +967,7 @@ uint64 twopmodq128_q4(uint64* p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 /*** 8-trial-factor version ***/
 uint64 twopmodq128_q8(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint64 k4, uint64 k5, uint64 k6, uint64 k7)
 {
-	ASSERT(HERE, p_in != 0x0, "Null p_in pointer!");
+	ASSERT(p_in != 0x0, "Null p_in pointer!");
 #if FAC_DEBUG
 	int dbg = 0;
 #endif
@@ -995,14 +995,14 @@ if(dbg) printf("twopmodq128_q8:\n");
 
 	// Use x0 as tmp to hold 2*p:
 	ADD128(p,p, x0);
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k4, (uint64 *)&q4, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k5, (uint64 *)&q5, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k6, (uint64 *)&q6, 2), "q must be < 2^128!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k7, (uint64 *)&q7, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k4, (uint64 *)&q4, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k5, (uint64 *)&q5, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k6, (uint64 *)&q6, 2), "q must be < 2^128!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k7, (uint64 *)&q7, 2), "q must be < 2^128!");
 
 	q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 	q1.d0 += 1;
@@ -1159,7 +1159,7 @@ if(dbg) printf("twopmodq128_q8:\n");
 	for(j = start_index-2; j >= 0; j--)
 	{
 #if FAC_DEBUG
-if(dbg) printf("A: x = %20llu + 2^64* %20llu\n",x0.d0,x0.d1);
+if(dbg) printf("A: x = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0,x0.d1);
 #endif
 	#if THREE_OP128
 		/* Fused version of all 3 of the above function calls. Surprisingly, on Alpha this was significantly slower
@@ -1206,8 +1206,8 @@ if(dbg) printf("A: x = %20llu + 2^64* %20llu\n",x0.d0,x0.d1);
 		, x6, lo6, hi6
 		, x7, lo7, hi7);
 #if FAC_DEBUG
-if(dbg) printf("B: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
-if(dbg) printf("B: h = %20llu + 2^64* %20llu\n",hi0.d0,hi0.d1);
+if(dbg) printf("B: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
+if(dbg) printf("B: h = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",hi0.d0,hi0.d1);
 #endif
 
 		/* For unknown reasons, the 8-operand version of MULL128 was slower than one-at-a-time. */
@@ -1242,7 +1242,7 @@ if(dbg) printf("B: h = %20llu + 2^64* %20llu\n",hi0.d0,hi0.d1);
 		MULL128(lo7, qinv7, lo7);
 		*/
 #if FAC_DEBUG
-if(dbg) printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
+if(dbg) printf("C: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
 #endif
 
 		MULH128_q8(
@@ -1275,7 +1275,7 @@ if(dbg) printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
 		MULH128(lo7, q7, lo7);
 		*/
 #if FAC_DEBUG
-if(dbg) printf("D: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
+if(dbg) printf("D: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
 #endif
 	#endif
 		/* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */
@@ -1288,7 +1288,7 @@ if(dbg) printf("D: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
 		if(CMPULT128(hi6, lo6)) { SUB128(q6, lo6, lo6);	ADD128(lo6, hi6, x6); } else { SUB128(hi6, lo6, x6); }
 		if(CMPULT128(hi7, lo7)) { SUB128(q7, lo7, lo7);	ADD128(lo7, hi7, x7); } else { SUB128(hi7, lo7, x7); }
 #if FAC_DEBUG
-if(dbg) printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1);
+if(dbg) printf("j = %2d, Res = %20" PRIu64 " + 2^64* %20" PRIu64,j,x0.d0,x0.d1);
 #endif
 
 		if(TEST_BIT128(pshift, j))
@@ -1303,7 +1303,7 @@ if(dbg) printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1);
 			if(CMPUGT128(x6, qhalf6)){ ADD128(x6, x6, x6); SUB128(x6, q6, x6); }else{ ADD128(x6, x6, x6); }
 			if(CMPUGT128(x7, qhalf7)){ ADD128(x7, x7, x7); SUB128(x7, q7, x7); }else{ ADD128(x7, x7, x7); }
 #if FAC_DEBUG
-if(dbg) printf(" *2 = %20llu + 2^64* %20llu",x0.d0,x0.d1);
+if(dbg) printf(" *2 = %20" PRIu64 " + 2^64* %20" PRIu64,x0.d0,x0.d1);
 #endif
 		}
 #if FAC_DEBUG
@@ -1341,7 +1341,7 @@ if(dbg) printf("\n");
 	SUB128(x7, q7, x7);
 
 #if FAC_DEBUG
-if(dbg) printf("x0 = %20llu + 2^64* %20llu\n",x0.d0, x0.d1);
+if(dbg) printf("x0 = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0, x0.d1);
 #endif
 
 	/* Only do the full 128-bit (Xj== 1) check if the bottom 64 bits of Xj == 1: */
diff --git a/src/twopmodq128_96.c b/src/twopmodq128_96.c
index e8eefe0c..88d77c49 100755
--- a/src/twopmodq128_96.c
+++ b/src/twopmodq128_96.c
@@ -58,7 +58,7 @@ uint64 twopmodq128_96(uint64 p, uint64 k)
 if(dbg)printf("twopmodq128_96:\n");
 #endif
 
-	ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+	ASSERT((p >> 63) == 0, "p must be < 2^63!");
 	q.d0 = p+p;	q.d1 = 0;
 #ifdef MUL_LOHI64_SUBROUTINE
 	MUL_LOHI64(q.d0, k,&q.d0,&q.d1);
@@ -66,7 +66,7 @@ if(dbg)printf("twopmodq128_96:\n");
 	MUL_LOHI64(q.d0, k, q.d0, q.d1);
 #endif
 	q.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
-	ASSERT(HERE, (q.d1 >> 32) == 0, "(q.d1 >> 32) != 0");
+	ASSERT((q.d1 >> 32) == 0, "(q.d1 >> 32) != 0");
 
 	if(first_entry || p != psave)
 	{
@@ -105,7 +105,7 @@ if(dbg)printf("twopmodq128_96:\n");
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq128_96 : (q.d0 & (uint64)1) == 1");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq128_96 : (q.d0 & (uint64)1) == 1");
 #endif
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = (uint64)0;
@@ -144,7 +144,7 @@ if(dbg)printf("twopmodq128_96:\n");
 #endif
 
 #if FAC_DEBUG
-	ASSERT(HERE, qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128_96 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
+	ASSERT(qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128_96 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
 #endif
 
 #if FAC_DEBUG
@@ -182,7 +182,7 @@ if(dbg)printf("twopmodq128_96:\n");
 	if((pshift >> j) & (uint64)1)
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)");
+		ASSERT(CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)");
 	#endif
 		ADD128(x,x,x);
 		if(CMPULE128(q,x)) SUB128(x,q,x);
@@ -223,7 +223,7 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_bu
 		if((pshift >> j) & (uint64)1)
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)");
+			ASSERT(CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)");
 		#endif
 			ADD128(x,x,x);	/* Since we're using 128-bit arithmetic for the add, x+x cannot overflow. */
 			if(CMPULE128(q,x)) SUB128(x,q,x);
@@ -298,7 +298,7 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 	if(dbg)	printf("start_index = %u\n", (uint32)start_index);
 #endif
 
-	ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+	ASSERT((p >> 63) == 0, "p must be < 2^63!");
 	q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p;
 	q0.d1 = q1.d1 = q2.d1 = q3.d1 = 0;
 #ifdef MUL_LOHI64_SUBROUTINE
@@ -312,10 +312,10 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 	MUL_LOHI64(q2.d0, k2, q2.d0, q2.d1);
 	MUL_LOHI64(q3.d0, k3, q3.d0, q3.d1);
 #endif
-	ASSERT(HERE, (q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0");
-	ASSERT(HERE, (q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0");
-	ASSERT(HERE, (q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0");
-	ASSERT(HERE, (q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0");
+	ASSERT((q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0");
+	ASSERT((q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0");
+	ASSERT((q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0");
+	ASSERT((q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0");
 
 	q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 	q1.d0 += 1;
@@ -324,10 +324,10 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q0.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q1.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q1.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q2.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q2.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q3.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q3.d0 & (uint64)1) == 1");
+	ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q0.d0 & (uint64)1) == 1");
+	ASSERT((q1.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q1.d0 & (uint64)1) == 1");
+	ASSERT((q2.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q2.d0 & (uint64)1) == 1");
+	ASSERT((q3.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q3.d0 & (uint64)1) == 1");
 #endif
 	qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2;	qinv0.d1 = (uint64)0;
 	qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2;	qinv1.d1 = (uint64)0;
@@ -420,10 +420,10 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 	if((pshift >> j) & (uint64)1)
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)");
-		ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)");
-		ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)");
-		ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)");
+		ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)");
+		ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)");
+		ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)");
+		ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)");
 	#endif
 		ADD128(x0, x0, x0);
 		ADD128(x1, x1, x1);
@@ -468,10 +468,10 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 
 		/* For unknown reasons, the 8-operand version of MULL128 was slower than one-at-a-time. */
 #if FAC_DEBUG
-	ASSERT(HERE, CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(SQR_LO)");
-	ASSERT(HERE, hi.d1 == 0      , "twopmodq128_96_q4 : hi.d1 != 0");
+	ASSERT(CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(SQR_LO)");
+	ASSERT(hi.d1 == 0      , "twopmodq128_96_q4 : hi.d1 != 0");
 	hi64 = hi.d0;
-	ASSERT(HERE, hi64 == hi0     , "twopmodq128_96_q4 : CMPEQ128(SQR_HI)");
+	ASSERT(hi64 == hi0     , "twopmodq128_96_q4 : CMPEQ128(SQR_HI)");
 	x=lo0;y=qinv0;
 	MULL128(x,y,lo);
 #endif
@@ -484,11 +484,11 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 	/* Need to be careful about the order of the 2 inputs here,
 	as MULH128x96 assumes the 2nd input is the one which is < 2^96: */
 #if FAC_DEBUG
-	ASSERT(HERE, CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULL128)");
+	ASSERT(CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULL128)");
 	x=lo0;y=q0;
 	MULH128(x,y,lo);
 	MULH128(y,x,hi);
-	ASSERT(HERE, CMPEQ128(lo, hi), "twopmodq128_96_q4 : MULH(X,Y) != MULH(Y,X)");
+	ASSERT(CMPEQ128(lo, hi), "twopmodq128_96_q4 : MULH(X,Y) != MULH(Y,X)");
 #endif
 		MULH128x96_q4(
 		  lo0, q0, lo0
@@ -503,7 +503,7 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 		, q3, lo3, lo3);
 	#endif
 #if FAC_DEBUG
-	ASSERT(HERE, CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULH())");
+	ASSERT(CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULH())");
 	/* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */
 	if(lo.d1 != 0 || hi64 < lo.d0)
 	{
@@ -523,7 +523,7 @@ if(dbg)printf("twopmodq128_96_q4:\n");
 		if(lo2.d1 != 0 || hi2 < lo2.d0){ SUB128(q2, lo2, x2);	x2.d0 +=  hi2; x2.d1 += (x2.d0 < hi2); } else { x2.d0 =  hi2 - lo2.d0; x2.d1 = (uint64)0; }
 		if(lo3.d1 != 0 || hi3 < lo3.d0){ SUB128(q3, lo3, x3);	x3.d0 +=  hi3; x3.d1 += (x3.d0 < hi3); } else { x3.d0 =  hi3 - lo3.d0; x3.d1 = (uint64)0; }
 #if FAC_DEBUG
-	ASSERT(HERE, CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())");
+	ASSERT(CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())");
 #endif
 
 #if FAC_DEBUG
@@ -537,10 +537,10 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_bu
 	if(CMPULE128(q0,x)) SUB128(x,q0,x);
 #endif
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)");
-			ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)");
-			ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)");
-			ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)");
+			ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)");
+			ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)");
+			ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)");
+			ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)");
 		#endif
 			ADD128(x0, x0, x0);
 			ADD128(x1, x1, x1);
@@ -557,7 +557,7 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_bu
 		}
 #if FAC_DEBUG
 if(dbg)printf("\n");
-	ASSERT(HERE, CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())");
+	ASSERT(CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())");
 #endif
 	}
 
@@ -638,7 +638,7 @@ if(dbg)printf("twopmodq128_96_q8:\n");
 		pshift = ~pshift;
 	}
 
-	ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+	ASSERT((p >> 63) == 0, "p must be < 2^63!");
 	q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p;
 	q0.d1 = q1.d1 = q2.d1 = q3.d1 = q4.d1 = q5.d1 = q6.d1 = q7.d1 = 0;
 #ifdef MUL_LOHI64_SUBROUTINE
@@ -669,28 +669,28 @@ if(dbg)printf("twopmodq128_96_q8:\n");
 	q5.d0 += 1;
 	q6.d0 += 1;
 	q7.d0 += 1;
-	ASSERT(HERE, (q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0");
-	ASSERT(HERE, (q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0");
-	ASSERT(HERE, (q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0");
-	ASSERT(HERE, (q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0");
-	ASSERT(HERE, (q4.d1 >> 32) == 0, "(q4.d1 >> 32) != 0");
-	ASSERT(HERE, (q5.d1 >> 32) == 0, "(q5.d1 >> 32) != 0");
-	ASSERT(HERE, (q6.d1 >> 32) == 0, "(q6.d1 >> 32) != 0");
-	ASSERT(HERE, (q7.d1 >> 32) == 0, "(q7.d1 >> 32) != 0");
+	ASSERT((q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0");
+	ASSERT((q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0");
+	ASSERT((q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0");
+	ASSERT((q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0");
+	ASSERT((q4.d1 >> 32) == 0, "(q4.d1 >> 32) != 0");
+	ASSERT((q5.d1 >> 32) == 0, "(q5.d1 >> 32) != 0");
+	ASSERT((q6.d1 >> 32) == 0, "(q6.d1 >> 32) != 0");
+	ASSERT((q7.d1 >> 32) == 0, "(q7.d1 >> 32) != 0");
 
 	/*
 	!    Find modular inverse (mod 2^128) of q in preparation for modular multiply.
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q0.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q1.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q1.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q2.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q2.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q3.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q3.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q4.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q4.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q5.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q5.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q6.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q6.d0 & (uint64)1) == 1");
-	ASSERT(HERE, (q7.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q7.d0 & (uint64)1) == 1");
+	ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q0.d0 & (uint64)1) == 1");
+	ASSERT((q1.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q1.d0 & (uint64)1) == 1");
+	ASSERT((q2.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q2.d0 & (uint64)1) == 1");
+	ASSERT((q3.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q3.d0 & (uint64)1) == 1");
+	ASSERT((q4.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q4.d0 & (uint64)1) == 1");
+	ASSERT((q5.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q5.d0 & (uint64)1) == 1");
+	ASSERT((q6.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q6.d0 & (uint64)1) == 1");
+	ASSERT((q7.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q7.d0 & (uint64)1) == 1");
 #endif
 	qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2;	qinv0.d1 = (uint64)0;
 	qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2;	qinv1.d1 = (uint64)0;
@@ -801,14 +801,14 @@ if(dbg)printf("twopmodq128_96_q8:\n");
 	if((pshift >> j) & (uint64)1)
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)");
-		ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)");
-		ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)");
-		ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)");
-		ASSERT(HERE, CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)");
-		ASSERT(HERE, CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)");
-		ASSERT(HERE, CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)");
-		ASSERT(HERE, CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)");
+		ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)");
+		ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)");
+		ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)");
+		ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)");
+		ASSERT(CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)");
+		ASSERT(CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)");
+		ASSERT(CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)");
+		ASSERT(CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)");
 	#endif
 		ADD128(x0, x0, x0);
 		ADD128(x1, x1, x1);
@@ -843,7 +843,7 @@ if(dbg)printf("twopmodq128_96_q8:\n");
 	for(j = start_index-2; j >= 0; j--)
 	{
 #if FAC_DEBUG
-if(dbg)printf("A: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
+if(dbg)printf("A: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
 #endif
 		/* Haven't gotten IA64 version of this working properly yet:
 		SQR_LOHI_INPLACE128_96_q8(
@@ -866,7 +866,7 @@ if(dbg)printf("A: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
 		, x6, lo6, hi6
 		, x7, lo7, hi7);
 #if FAC_DEBUG
-if(dbg)printf("B: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
+if(dbg)printf("B: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
 #endif
 
 		/* For unknown reasons, the 8-operand version of MULL128 was slower than one-at-a-time. */
@@ -880,7 +880,7 @@ if(dbg)printf("B: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
 		, lo6, qinv6
 		, lo7, qinv7);
 #if FAC_DEBUG
-if(dbg)printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
+if(dbg)printf("C: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
 #endif
 	#if(USE_128x96 > 0)
 		/* Need to be careful about the order of the 2 inputs here,
@@ -906,13 +906,13 @@ if(dbg)printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
 		, q7, lo7, lo7);
 	#endif
 #if FAC_DEBUG
-if(dbg)printf("D: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
+if(dbg)printf("D: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
 #endif
 		/* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */
 #if FAC_DEBUG
-if(dbg)printf("On entry to (h<l): hi = %20llu\n",hi0);
-if(dbg)printf("l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1);
-if(dbg)printf("x = %20llu + 2^64* %20llu\n",x0.d0,x0.d1);
+if(dbg)printf("On entry to (h<l): hi = %20" PRIu64 "\n",hi0);
+if(dbg)printf("l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1);
+if(dbg)printf("x = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0,x0.d1);
 #endif
 		if(lo0.d1 != 0 || hi0 < lo0.d0){ SUB128(q0, lo0, x0);	x0.d0 +=  hi0; x0.d1 += (x0.d0 < hi0); } else { x0.d0 =  hi0 - lo0.d0; x0.d1 = (uint64)0; }
 		if(lo1.d1 != 0 || hi1 < lo1.d0){ SUB128(q1, lo1, x1);	x1.d0 +=  hi1; x1.d1 += (x1.d0 < hi1); } else { x1.d0 =  hi1 - lo1.d0; x1.d1 = (uint64)0; }
@@ -923,20 +923,20 @@ if(dbg)printf("x = %20llu + 2^64* %20llu\n",x0.d0,x0.d1);
 		if(lo6.d1 != 0 || hi6 < lo6.d0){ SUB128(q6, lo6, x6);	x6.d0 +=  hi6; x6.d1 += (x6.d0 < hi6); } else { x6.d0 =  hi6 - lo6.d0; x6.d1 = (uint64)0; }
 		if(lo7.d1 != 0 || hi7 < lo7.d0){ SUB128(q7, lo7, x7);	x7.d0 +=  hi7; x7.d1 += (x7.d0 < hi7); } else { x7.d0 =  hi7 - lo7.d0; x7.d1 = (uint64)0; }
 #if FAC_DEBUG
-if(dbg)printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1);
+if(dbg)printf("j = %2d, Res = %20" PRIu64 " + 2^64* %20" PRIu64,j,x0.d0,x0.d1);
 #endif
 
 		if((pshift >> j) & (uint64)1)
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)");
-			ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)");
-			ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)");
-			ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)");
-			ASSERT(HERE, CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)");
-			ASSERT(HERE, CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)");
-			ASSERT(HERE, CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)");
-			ASSERT(HERE, CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)");
+			ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)");
+			ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)");
+			ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)");
+			ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)");
+			ASSERT(CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)");
+			ASSERT(CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)");
+			ASSERT(CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)");
+			ASSERT(CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)");
 		#endif
 			ADD128(x0, x0, x0);
 			ADD128(x1, x1, x1);
@@ -956,7 +956,7 @@ if(dbg)printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1);
 			if(CMPULE128(q6, x6)) SUB128(x6, q6, x6);
 			if(CMPULE128(q7, x7)) SUB128(x7, q7, x7);
 #if FAC_DEBUG
-if(dbg)printf(" *2 = %20llu + 2^64* %20llu",x0.d0,x0.d1);
+if(dbg)printf(" *2 = %20" PRIu64 " + 2^64* %20" PRIu64,x0.d0,x0.d1);
 #endif
 		}
 #if FAC_DEBUG
@@ -993,7 +993,7 @@ if(dbg)printf("\n");
 	SUB128(x7, q7, x7);
 
 #if FAC_DEBUG
-if(dbg)printf("x0 = %20llu + 2^64* %20llu\n",x0.d0, x0.d1);
+if(dbg)printf("x0 = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0, x0.d1);
 #endif
 
 	/* Only do the full 128-bit (Xj== 1) check if the bottom 64 bits of Xj == 1: */
diff --git a/src/twopmodq160.c b/src/twopmodq160.c
index 9965c0f6..72cc519e 100755
--- a/src/twopmodq160.c
+++ b/src/twopmodq160.c
@@ -64,7 +64,7 @@ uint64 twopmodq160(uint64 *p_in, uint64 k)
 #if FAC_DEBUG
 if(dbg)printf("twopmodq160:\n");
 #endif
-	ASSERT(HERE, (p.d2 == 0) && (p.d1 >> 63) == 0, "p must be < 2^127!");
+	ASSERT((p.d2 == 0) && (p.d1 >> 63) == 0, "p must be < 2^127!");
 	ADD128(p,p, q);
 	q.d2 = mi64_mul_scalar((uint64 *)&q, k, (uint64 *)&q, 2);
 	q.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
@@ -94,7 +94,7 @@ if(dbg)printf("twopmodq160:\n");
 		{
 			j = leadz64(pshift.d2);	/* Remember, pshift is stored in a 192-bit... */
 		#if FAC_DEBUG
-			ASSERT(HERE, j >= 32,"twopmodq160: j >= 32");
+			ASSERT(j >= 32,"twopmodq160: j >= 32");
 		#endif
 			/* Extract leftmost 8 bits of pshift (if > 159, use the leftmost 7) and subtract from 160: */
 			lead8 = (((pshift.d2<<j) + (pshift.d1>>(64-j))) >> 56);	/* lead8 in [128, 255] */
@@ -149,7 +149,7 @@ if(dbg)printf("twopmodq160:\n");
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq160 : (q.d0 & (uint64)1) == 1");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq160 : (q.d0 & (uint64)1) == 1");
 #endif
 
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
@@ -172,14 +172,14 @@ if(dbg)printf("twopmodq160:\n");
 	MULL160(q, qinv, x);
 #if FAC_DEBUG
 	MULL192(q, qinv, y);	y.d2 &= 0x00000000ffffffff;
-	ASSERT(HERE, CMPEQ192(x, y), "twopmodq160: CMPEQ192(x, y)");
+	ASSERT(CMPEQ192(x, y), "twopmodq160: CMPEQ192(x, y)");
 	SUB160 (TWO160, y, y);
 	MULL192(y, qinv, y);	y.d2 &= 0x00000000ffffffff;
 #endif
 	SUB160 (TWO160, x, x);
 	MULL160(qinv, x, qinv);
 #if FAC_DEBUG
-	ASSERT(HERE, CMPEQ192(qinv, y), "twopmodq160: CMPEQ192(qinv, y)");
+	ASSERT(CMPEQ192(qinv, y), "twopmodq160: CMPEQ192(qinv, y)");
 #endif
 
 	MULL160(q, qinv, x);
@@ -211,7 +211,7 @@ printf("");
 #endif
 	MULH160(q,lo,lo);
 #if FAC_DEBUG
-	ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: CMPEQ192(lo,y)");
+	ASSERT(CMPEQ192(lo,y), "twopmodq160: CMPEQ192(lo,y)");
 	if(dbg) printf("q*lo/2^160 = %s\n", &char_buf[convert_uint192_base10_char(char_buf, lo)]);
 #endif
 
@@ -224,7 +224,7 @@ printf("");
 	if(TEST_BIT160(pshift, j))
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)");
+		ASSERT(CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)");
 	#endif
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(CMPUGT160(x, qhalf)){ ADD160(x, x, x); SUB160(x, q, x); }else{ ADD160(x, x, x); }
@@ -248,20 +248,20 @@ if(dbg) printf("2x= %s\n", &char_buf[convert_uint192_base10_char(char_buf, x)]);
 		SQR_LOHI192(x, y, z);
 		LSHIFT_FAST192(z,32,z);	z.d0 += (y.d2 >> 32);	/* x^2/2^160 */
 		y.d2 &= 0x00000000ffffffff;							/* x^2%2^160 */
-		ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: SQR_LOHI160: CMPEQ192(lo,y)");
-		ASSERT(HERE, CMPEQ192(hi,z), "twopmodq160: SQR_LOHI160: CMPEQ192(hi,z)");
+		ASSERT(CMPEQ192(lo,y), "twopmodq160: SQR_LOHI160: CMPEQ192(lo,y)");
+		ASSERT(CMPEQ192(hi,z), "twopmodq160: SQR_LOHI160: CMPEQ192(hi,z)");
 		y = lo;
 		MULL192(y, qinv, y);	y.d2 &= 0x00000000ffffffff;
 	#endif
 		MULL160(lo,qinv,lo);
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: MULL160: CMPEQ192(lo,y)");
+		ASSERT(CMPEQ192(lo,y), "twopmodq160: MULL160: CMPEQ192(lo,y)");
 		LSHIFT_FAST192(lo,32,y);	/* y = lo*2^32 */
 		MULH192(q,y,y);
 	#endif
 		MULH160(q,lo,lo);
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: MULH160: CMPEQ192(lo,y)");
+		ASSERT(CMPEQ192(lo,y), "twopmodq160: MULH160: CMPEQ192(lo,y)");
 	#endif
 
 		/* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */
@@ -281,7 +281,7 @@ if(dbg) printf("2x= %s\n", &char_buf[convert_uint192_base10_char(char_buf, x)]);
 		if(TEST_BIT160(pshift, j))
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)");
+			ASSERT(CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT160(x, qhalf)){ ADD160(x, x, x); SUB160(x, q, x); }else{ ADD160(x, x, x); }
diff --git a/src/twopmodq192.c b/src/twopmodq192.c
index c6f7c3db..6da8910f 100755
--- a/src/twopmodq192.c
+++ b/src/twopmodq192.c
@@ -106,7 +106,7 @@ uint192 twopmmodq192(uint192 p, uint192 q)
 	}
 #endif
 	// Find inverse (mod 2^192) of q; q must be odd for Montgomery-style modmul to work:
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmmodq192 : q must be odd for Montgomery-style modmul to work");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmmodq192 : q must be odd for Montgomery-style modmul to work");
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = qinv.d2 = 0ull;
 	/* Compute qinv  = q^-1 (mod R = 2^192) via Newton iteration qinv = qinv*(2 - q*qinv), starting with
@@ -222,7 +222,7 @@ uint192 twopmodq192(uint192 p, uint192 q)
 		mi64_div((uint64*)&qhalf,(uint64*)&p, 3,3, (uint64*)&x, (uint64*)&lo);	// x contains k; lo = (q-1)/2 % p
 	//	dbg = (x.d0 == 488) && (x.d1 == 0 && x.d2 == 0);
 	if(dbg) {
-		ASSERT(HERE, mi64_iszero((uint64*)&lo, 3), "k must divide (q-1)/2!");
+		ASSERT(mi64_iszero((uint64*)&lo, 3), "k must divide (q-1)/2!");
 		printf("twopmodq192:\n");
 	}
 	#endif
@@ -309,7 +309,7 @@ uint192 twopmodq192(uint192 p, uint192 q)
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #if FAC_DEBUG
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq192 : q must be odd for Montgomery-style modmul to work!");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq192 : q must be odd for Montgomery-style modmul to work!");
 #endif
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d2 = qinv.d1 = (uint64)0;
@@ -392,7 +392,7 @@ q*qinv*lo = |000 (192-x bits) 000||-------------------------------------- q*qinv
 	if(TEST_BIT192(pshift, j))
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)");
+		ASSERT(CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)");
 	#endif
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(CMPUGT192(x, qhalf)){ ADD192(x, x, x); SUB192(x, q, x); }else{ ADD192(x, x, x); }
@@ -432,7 +432,7 @@ q*qinv*lo = |000 (192-x bits) 000||-------------------------------------- q*qinv
 		if(TEST_BIT192(pshift, j))
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)");
+			ASSERT(CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT192(x, qhalf)){ ADD192(x, x, x); SUB192(x, q, x); }else{ ADD192(x, x, x); }
@@ -481,10 +481,10 @@ uint64 twopmodq192_q4(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 
 	// Use x0 as tmp to hold 2*p:
 	ADD192(p,p, x0);
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!");
 	q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 	q1.d0 += 1;
 	q2.d0 += 1;
@@ -647,7 +647,7 @@ uint64 twopmodq192_q4(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 	if(TEST_BIT192(pshift, j))
 	{
 	#if FAC_DEBUG
-		ASSERT(HERE, CMPULT192(x0,q0), "twopmodq192_q4: CMPULT192(x,q)");
+		ASSERT(CMPULT192(x0,q0), "twopmodq192_q4: CMPULT192(x,q)");
 	#endif
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(CMPUGT192(x0, qhalf0)){ ADD192(x0, x0, x0); SUB192(x0, q0, x0); }else{ ADD192(x0, x0, x0); }
@@ -708,7 +708,7 @@ uint64 twopmodq192_q4(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 		if(TEST_BIT192(pshift, j))
 		{
 		#if FAC_DEBUG
-			ASSERT(HERE, CMPULT192(x0,q0), "twopmodq192_q4 : CMPULT192(x,q)");
+			ASSERT(CMPULT192(x0,q0), "twopmodq192_q4 : CMPULT192(x,q)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT192(x0, qhalf0)){ ADD192(x0, x0, x0); SUB192(x0, q0, x0); }else{ ADD192(x0, x0, x0); }
@@ -774,7 +774,7 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr
 		__vout.d2 = __lo + __cy;\
 		__cw = __hi + (__vout.d2 < __lo);	/* carryout into cw */\
 		__lo = __vout.d2;	/* bw0 = z[len-1]; */\
-/*if(__k==900) {printf("Macro: bw0 = %20llu, cw = %20llu, z` = %s\n", __lo,__cw,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\
+/*if(__k==900) {printf("Macro: bw0 = %20" PRIu64 ", cw = %20" PRIu64 ", z` = %s\n", __lo,__cw,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\
 	/* 2. compute low n words of z = z' + y via vector-vector add, any carryout of that gets added to a 2nd copy of cw, cz: */\
 		/* mi64_add(y,z,z, len):	// z = z' + y */\
 		__vout.d0 = __vin.d0 + __vout.d0;\
@@ -789,9 +789,9 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr
 		__cy = (__hi < __vin.d2);\
 		__vout.d2 += __hi;\
 		__cy += (__vout.d2 < __hi);\
-/*if(__k==900) {printf("Macro: __vout.d2 [out] = %20llu\n", __vout.d2);}*/\
+/*if(__k==900) {printf("Macro: __vout.d2 [out] = %20" PRIu64 "\n", __vout.d2);}*/\
 		__cz = __cw + __cy;	/* cz = cw + mi64_add(y,z,z, len);	// z = z' + y */\
-/*if(__k==900) {printf("Macro: cz = %20llu, z = %s\n", __cz,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\
+/*if(__k==900) {printf("Macro: cz = %20" PRIu64 ", z = %s\n", __cz,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\
 \
 	/* 3. compute low n words of z >> (b-p), then separately shift in cz from the left, via (2^b*cz) >> (b-p) = (cz << p). */\
 		/* bw1 = mi64_shrl(z,z,nshift,len);	// low n words of z >> (b-p); high 64 bits of off-shifted portion saved in bw1 */\
@@ -818,7 +818,7 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr
 			/* Most-significant element gets zeros shifted in from the left: */\
 			__vout.d2 >>= __rembits;\
 		}\
-/*if(__k==900) {printf("Macro: bw1 = %20llu, z>> = %s\n", __hi,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\
+/*if(__k==900) {printf("Macro: bw1 = %20" PRIu64 ", z>> = %s\n", __hi,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\
 \
 		/* Check for borrow-on-subtract of to-be-off-shifted sections: */\
 		__bw = (__lo > __hi);\
@@ -869,7 +869,7 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr
 			__cy = __vout.d2 - __bw;\
 			__bw = (__cy > __vout.d2);\
 			__vout.d2 = __cy;\
-			ASSERT(HERE, !__bw, "bw != 0");\
+			ASSERT(!__bw, "bw != 0");\
 		}\
 	}
 
@@ -900,10 +900,10 @@ uint64 twopmodq192_q4_qmmp(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64
 	p.d0 = p_in[0]; p.d1 = p_in[1]; p.d2 = p_in[2];
 	// Use x0 as tmp to hold 2*p:
 	ADD192(p,p, x0);
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!");
 
 	q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 	q1.d0 += 1;
@@ -923,7 +923,7 @@ uint64 twopmodq192_q4_qmmp(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64
 		// Check that it's really a double-Mersenne: Adding one, right-shift by mmpsave = #bits give 1:
 		mi64_add_scalar(p_in, 1ull, (uint64*)&x, 3);
 		mi64_shrl((uint64*)&x, (uint64*)&x, mmpsave, 3,3);
-		--x.d0;	ASSERT(HERE, mi64_iszero((uint64*)&x, 3), "MMp check failed!");
+		--x.d0;	ASSERT(mi64_iszero((uint64*)&x, 3), "MMp check failed!");
 		x.d0 = 192; x.d1 = x.d2 = 0;
 		ADD192(p, x, pshift);
 		if(pshift.d2)
@@ -1082,10 +1082,10 @@ if(dbg) {
 	#endif
 
 	#if 1
-		MULH192_QMMP(x0,mmpsave,k0,lo0,3);	//MULH192(x0,q0,x);	ASSERT(HERE, CMPEQ192(lo0, x), "MULH192_QMMP fail!");
-		MULH192_QMMP(x1,mmpsave,k1,lo1,3);	//MULH192(x1,q1,x);	ASSERT(HERE, CMPEQ192(lo1, x), "MULH192_QMMP fail!");
-		MULH192_QMMP(x2,mmpsave,k2,lo2,3);	//MULH192(x2,q2,x);	ASSERT(HERE, CMPEQ192(lo2, x), "MULH192_QMMP fail!");
-		MULH192_QMMP(x3,mmpsave,k3,lo3,3);	//MULH192(x3,q3,x);	ASSERT(HERE, CMPEQ192(lo3, x), "MULH192_QMMP fail!");
+		MULH192_QMMP(x0,mmpsave,k0,lo0,3);	//MULH192(x0,q0,x);	ASSERT(CMPEQ192(lo0, x), "MULH192_QMMP fail!");
+		MULH192_QMMP(x1,mmpsave,k1,lo1,3);	//MULH192(x1,q1,x);	ASSERT(CMPEQ192(lo1, x), "MULH192_QMMP fail!");
+		MULH192_QMMP(x2,mmpsave,k2,lo2,3);	//MULH192(x2,q2,x);	ASSERT(CMPEQ192(lo2, x), "MULH192_QMMP fail!");
+		MULH192_QMMP(x3,mmpsave,k3,lo3,3);	//MULH192(x3,q3,x);	ASSERT(CMPEQ192(lo3, x), "MULH192_QMMP fail!");
 	#else
 		MULH192(x0,q0,lo0);
 		MULH192(x1,q1,lo1);
@@ -1159,14 +1159,14 @@ uint64 twopmodq192_q8(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3,
 
 	// Use x0 as tmp to hold 2*p:
 	ADD192(p,p, x0);
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k4, (uint64 *)&q4, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k5, (uint64 *)&q5, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k6, (uint64 *)&q6, 3), "q must be < 2^192!");
-	ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k7, (uint64 *)&q7, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k4, (uint64 *)&q4, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k5, (uint64 *)&q5, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k6, (uint64 *)&q6, 3), "q must be < 2^192!");
+	ASSERT(!mi64_mul_scalar((uint64 *)&x0, k7, (uint64 *)&q7, 3), "q must be < 2^192!");
 
 	q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 	q1.d0 += 1;
diff --git a/src/twopmodq256.c b/src/twopmodq256.c
index cc7a79d6..b42d349e 100755
--- a/src/twopmodq256.c
+++ b/src/twopmodq256.c
@@ -136,7 +136,7 @@ uint256 twopmmodq256(uint256 p, uint256 q)
 	}
 #endif
 	// Find inverse (mod 2^256) of q; q must be odd for Montgomery-style modmul to work:
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmmodq256 : q must be odd for Montgomery-style modmul to work");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmmodq256 : q must be odd for Montgomery-style modmul to work");
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = qinv.d2 = qinv.d3 = 0ull;
 	/* Compute qinv  = q^-1 (mod R = 2^256) via Newton iteration qinv = qinv*(2 - q*qinv), starting with
@@ -342,7 +342,7 @@ uint256 twopmodq256(uint256 p, uint256 q)
 #endif
 
 	/* Since zstart is a power of two < 2^256, use a streamlined code sequence for the first iteration: */
-	ASSERT(HERE, start_index>=2, "twopmodq256 : start_index < 2!");
+	ASSERT(start_index>=2, "twopmodq256 : start_index < 2!");
 	j = start_index-1;
 
 	/* MULL256(zstart,qinv,lo) simply amounts to a left-shift of the bits of qinv: */
diff --git a/src/twopmodq64_test.c b/src/twopmodq64_test.c
index dc4003e6..00461d36 100755
--- a/src/twopmodq64_test.c
+++ b/src/twopmodq64_test.c
@@ -113,7 +113,7 @@
 		x = x+x-q+FERMAT;	// In the case of interest, x = (q+1)/2 < 2^63, so x + x cannot overflow.
 		return (x==1);
 	#else	// ifndef __CUDA_ARCH__
-		ASSERT(HERE, 0, "Device code being called in host mode!");
+		ASSERT(0, "Device code being called in host mode!");
 		return 0;
 	#endif
 	}
@@ -246,7 +246,7 @@
 		r += (x3+x3-q3+FERMAT == 1);
 		return r;
 	#else	// ifndef __CUDA_ARCH__
-		ASSERT(HERE, 0, "Device code being called in host mode!");
+		ASSERT(0, "Device code being called in host mode!");
 		return 0;
 	#endif
 	}
@@ -327,7 +327,7 @@
 uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 {
 //int dbg = ( (p == (1ull<<32)) && ( (k0 == 2958ull) || (k1 == 2958ull) || (k2 == 2958ull) || (k3 == 2958ull) ) );
-//if(dbg) printf("Hit! k0-3 = %llu, %llu, %llu, %llu\n",k0, k1, k2, k3);
+//if(dbg) printf("Hit! k0-3 = %" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "\n",k0, k1, k2, k3);
 	int32 j;
 	uint64 r = (p<<1), q0 = 1+r*k0, q1 = 1+r*k1, q2 = 1+r*k2, q3 = 1+r*k3
 	, qinv0, qinv1, qinv2, qinv3
@@ -348,7 +348,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!");
 
 	// Compute 64-bit mod-inverses starting with 8-bits-good initializers:
 	uint32 q32_0,q32_1,q32_2,q32_3, qi32_0,qi32_1,qi32_2,qi32_3;
@@ -407,7 +407,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 		x2 = y2;
 		x3 = y3;
 	}
-//if(dbg) printf("x0 = %llu\n",x0);
+//if(dbg) printf("x0 = %" PRIu64 "\n",x0);
 
 //printf("twopmodq64_q4 : x1 = %s\n", &str0[convert_uint64_base10_char(str0, x1)] );
 //for(j = start_index-2; j >= 0; j--) {
@@ -553,7 +553,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3)
 //printf("twopmodq64_q4 : x1 = %s\n", &str0[convert_uint64_base10_char(str0, x1+x1-q1)] );
 //exit(0);
 	/*...Double and return.	These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. */
-//if(dbg) printf("xout = %llu\n",x0+x0-q0+FERMAT);
+//if(dbg) printf("xout = %" PRIu64 "\n",x0+x0-q0+FERMAT);
 	r = 0;
 	if(x0+x0-q0+FERMAT == 1) r +=  1;
 	if(x1+x1-q1+FERMAT == 1) r +=  2;
@@ -586,7 +586,7 @@ uint64 twopmodq64_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6
 	pshift = ~pshift;
 
 	/* q must be odd for Montgomery-style modmul to work: */
-	ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
+	ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!");
 
 	// Compute 64-bit mod-inverses starting with 8-bits-good initializers:
 	uint32 q32_0,q32_1,q32_2,q32_3,q32_4,q32_5,q32_6,q32_7, qi32_0,qi32_1,qi32_2,qi32_3,qi32_4,qi32_5,qi32_6,qi32_7;
diff --git a/src/twopmodq80.c b/src/twopmodq80.c
index 5e334ee4..8fcac633 100755
--- a/src/twopmodq80.c
+++ b/src/twopmodq80.c
@@ -94,11 +94,11 @@ The key 3-operation sequence here is as follows:
 		int fidx;
 	#if FAC_DEBUG
 		if(dbg) {
-			printf("twopmodq78_3WORD_DOUBLE with p = %u, k = %llu, tid = %u\n",p,k,i);
+			printf("twopmodq78_3WORD_DOUBLE with p = %u, k = %" PRIu64 ", tid = %u\n",p,k,i);
 		}
 	#endif
 /*
-if(k == 7946076362870052)printf("In twopmodq78_3WORD_DOUBLE with i = %u, p = %u, k = %llu\n",i,p,k);
+if(k == 7946076362870052)printf("In twopmodq78_3WORD_DOUBLE with i = %u, p = %u, k = %" PRIu64 "\n",i,p,k);
 */
 		q.d0 = p+p;
 	#ifdef MUL_LOHI64_SUBROUTINE
@@ -138,7 +138,7 @@ if(k == 7946076362870052)printf("In twopmodq78_3WORD_DOUBLE with i = %u, p = %u,
 		MULH64(q.d0, qinv.d0, hi64);
 		qinv.d1 = -qinv.d0*(q.d1*qinv.d0 + hi64);
 		qinv.d1 &= 0x0000000000003fff;	/* Only want the lower 14 bits here */
-//	if(i == 0)printf("In twopmodq78_gpu with p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,k,zshift,start_index);
+//	if(i == 0)printf("In twopmodq78_gpu with p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,k,zshift,start_index);
 		/* Convert qinv to floating form: */
 		CVT_UINT78_3WORD_DOUBLE(qinv, fqinv0,fqinv1,fqinv2);
 	#if FAC_DEBUG
@@ -284,12 +284,12 @@ z0 = 6272576; z12 = 898312175313603; z=z0+a*z12	<*** z0 is +1 too large ***
 		SUB96(x,q,x);
 	#if FAC_DEBUG
 		if(dbg) {
-			printf("k = %llu: X_out = %u*2^64 + %llu\n", x.d1,x.d0);
+			printf("k = %" PRIu64 ": X_out = %u*2^64 + %" PRIu64 "\n", k,x.d1,x.d0);
 		}
 	#endif
 		return (CMPEQ96(x, ONE96));
 	#else	// ifndef __CUDA_ARCH__
-		ASSERT(HERE, 0, "Device code being called in host mode!");
+		ASSERT(0, "Device code being called in host mode!");
 		return 0;
 	#endif
 	}
@@ -629,7 +629,7 @@ z0 = 6272576; z12 = 898312175313603; z=z0+a*z12	<*** z0 is +1 too large ***
 		r += tmp3 << 3;
 		return r;
 	#else	// ifndef __CUDA_ARCH__
-		ASSERT(HERE, 0, "Device code being called in host mode!");
+		ASSERT(0, "Device code being called in host mode!");
 		return 0;
 	#endif
 	}
@@ -757,10 +757,10 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k)
 
 #if FAC_DEBUG
 	if(dbg) {
-		printf("%s with p = %llu, k = %llu\n",func,p,k);
+		printf("%s with p = %" PRIu64 ", k = %" PRIu64 "\n",func,p,k);
 	}
 #endif
-	ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!");
+	ASSERT((p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!");
 	q.d0 = p+p;
 #ifdef MUL_LOHI64_SUBROUTINE
 	// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -769,7 +769,7 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k)
 	MUL_LOHI64(q.d0, k, q.d0, q.d1);
 #endif
 	q.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
-	ASSERT(HERE, (q.d1 >> 14) == 0, "twopmodq78 : (q.d1 >> 14) != 0");
+	ASSERT((q.d1 >> 14) == 0, "twopmodq78 : (q.d1 >> 14) != 0");
 
 	/* Convert q to floating form: */
 	CVT_UINT78_3WORD_DOUBLE(q, fq0,fq1,fq2);
@@ -812,7 +812,7 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k)
 
 		pshift = ~pshift;
 	  #if FAC_DEBUG
-		if(dbg) printf("pshift = 0x%llX\n",pshift);
+		if(dbg) printf("pshift = 0x%" PRIX64 "\n",pshift);
 	  #endif
 	}
 
@@ -864,7 +864,7 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k)
 #endif
 	qinv.d1 &= 0x0000000000003fff;	/* Only want the lower 14 bits here */
 
-//	printf("twopmodq78_3WORD_DOUBLE with p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", (uint32)p,(uint32)pshift,k,zshift,start_index);
+//	printf("twopmodq78_3WORD_DOUBLE with p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", (uint32)p,(uint32)pshift,k,zshift,start_index);
 
 	/* Convert qinv to floating form: */
 /*	cvt_uint78_3word_double(qinv, &fqinv0,&fqinv1,&fqinv2);	*/
@@ -957,8 +957,8 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k)
 	CVT78_3WORD_DOUBLE_UINT96(fx0,fx1,fx2, x);
 
 if(~pshift != p+78) {
-	ASSERT(HERE, ~pshift > (p+78), "twopmodq80 : Only support pshift >= true value!");
-	ASSERT(HERE,(~pshift - (p+78)) < 32, "twopmodq80 : Only support pshift-diff < 32!");
+	ASSERT(~pshift > (p+78), "twopmodq80 : Only support pshift >= true value!");
+	ASSERT((~pshift - (p+78)) < 32, "twopmodq80 : Only support pshift-diff < 32!");
 	qmul  = fx0 + fx1*TWO26FLOAT;
 	qmul += fx2*TWO26FLOAT*TWO26FLOAT;
 	// Extra power of 2 is because in this flow we do not do the final 2*x-q step in the 'else' below:
@@ -988,7 +988,7 @@ if(~pshift != p+78) {
 	SUB96(x,lo,x);
 #if FAC_DEBUG
 	if(dbg) {
-		printf("X_out[A] = %u*2^64 + %llu\n", x.d1,x.d0);
+		printf("X_out[A] = %u*2^64 + %" PRIu64 "\n", x.d1,x.d0);
 	}
 #endif
 } else {
@@ -997,7 +997,7 @@ if(~pshift != p+78) {
 	SUB96(x,q,x);
 #if FAC_DEBUG
 	if(dbg) {
-		printf("X_out[B] = %u*2^64 + %llu\n", x.d1,x.d0);
+		printf("X_out[B] = %u*2^64 + %" PRIu64 "\n", x.d1,x.d0);
 	}
 #endif
 }
@@ -1010,25 +1010,25 @@ if(~pshift != p+78) {
 #ifdef USE_IMCI512
 
 	uint64 twopmodq78_3WORD_DOUBLE_q2(uint64 p, uint64 k0, uint64 k1, int init_sse2, int thr_id) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q2 cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q2 cannot be called for k1om builds!");	return 0;
 	}
 	uint64 twopmodq78_3WORD_DOUBLE_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, int init_sse2, int thr_id) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q4 cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q4 cannot be called for k1om builds!");	return 0;
 	}
 	uint64 twopmodq78_3WORD_DOUBLE_q4_REF(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q4_REF cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q4_REF cannot be called for k1om builds!");	return 0;
 	}
 	uint64 twopmodq78_3WORD_DOUBLE_q8(uint64 p, uint64 k[], int init_sse2, int thr_id) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q8 cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q8 cannot be called for k1om builds!");	return 0;
 	}
 	uint64 twopmodq78_3WORD_DOUBLE_q16(uint64 p, uint64 k[], int init_sse2, int thr_id) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q16 cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q16 cannot be called for k1om builds!");	return 0;
 	}
 	uint64 twopmodq78_3WORD_DOUBLE_q32(uint64 p, uint64 k[], int init_sse2, int thr_id) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q32 cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q32 cannot be called for k1om builds!");	return 0;
 	}
 	uint64 twopmodq78_3WORD_DOUBLE_q64(uint64 p, uint64 k[], int init_sse2, int thr_id) {
-		ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q64 cannot be called for k1om builds!");	return 0;
+		ASSERT(0,"twopmodq78_3WORD_DOUBLE_q64 cannot be called for k1om builds!");	return 0;
 	}
 
 #else
@@ -1118,7 +1118,7 @@ if(~pshift != p+78) {
 		double gq0,gq1,gq2, gqinv0,gqinv1,gqinv2, gx0,gx1,gx2, glo0,glo1,glo2, ghi0,ghi1,ghi2;
 		// Note: In ||-init mode, *value* of init_sse2 to store #threads-to-init-for:
 		if(init_sse2) {
-			ASSERT(HERE, init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!");
+			ASSERT(init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!");
 			return 0;	// In non-SIMD mode, ||-init call is a no-op
 		}
 
@@ -1126,7 +1126,7 @@ if(~pshift != p+78) {
 
 	#if FAC_DEBUG
 		if(dbg) {
-			printf("%s with p = %llu, k0 = %llu, k1 = %llu\n",func,p,k0,k1);
+			printf("%s with p = %" PRIu64 ", k0 = %" PRIu64 ", k1 = %" PRIu64 "\n",func,p,k0,k1);
 		}
 	#endif
 
@@ -1162,7 +1162,7 @@ if(~pshift != p+78) {
 
 			pshift = ~pshift;
 		  #if FAC_DEBUG
-			if(dbg) printf("pshift = 0x%llX\n",pshift);
+			if(dbg) printf("pshift = 0x%" PRIX64 "\n",pshift);
 		  #endif
 		}
 
@@ -1183,18 +1183,18 @@ if(~pshift != p+78) {
 			#endif
 				fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 			#ifndef COMPILER_TYPE_GCC
-				ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+				ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 			#endif
-				ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-				ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+				ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+				ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			}
 			if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 				free((void *)sc_arr);	sc_arr=0x0;
 			}
 			// Alloc the local-memory block:
-			sc_arr = ALLOC_DOUBLE(sc_arr, 0x2c*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_DOUBLE(sc_arr, 0x2c*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);	// Force vec_dbl-alignment
-			ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		#ifdef MULTITHREAD
 			__r0  = sc_ptr;
 			two13i  = sc_ptr + 0x18;
@@ -1252,7 +1252,7 @@ if(~pshift != p+78) {
 
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	  #ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		sc_ptr = __r0 + thr_id*0x2c;
 		/* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 2 to span an SSE register: */
 		fq0    = sc_ptr + 0x00;		gq0    = sc_ptr + 0x01;
@@ -1271,13 +1271,13 @@ if(~pshift != p+78) {
 		two26i = sc_ptr + 0x1c;
 		sse2_rnd=sc_ptr + 0x1e;
 		half   = sc_ptr + 0x20;
-	//	printf("Thr %d ONE96_PTR address = %llX; data.d0,d1 = %llu,%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1);
-		tmp = (vec_dbl*)sse2_rnd; ASSERT(HERE,(tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!");
+	//	printf("Thr %d ONE96_PTR address = %" PRIX64 "; data.d0,d1 = %" PRIu64 ",%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1);
+		tmp = (vec_dbl*)sse2_rnd; ASSERT((tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!");
 	  #endif
 
 	#endif
 
-		ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!");
 		q0.d0 = q1.d0 = p+p;
 	#ifdef MUL_LOHI64_SUBROUTINE
 		// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -1289,8 +1289,8 @@ if(~pshift != p+78) {
 	#endif
 		q0.d0 += 1;	/* Since 2*p*k even, no need to check for overflow here */
 		q1.d0 += 1;
-		ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q2 : (q0.d1 >> 14) != 0");
-		ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q2 : (q1.d1 >> 14) != 0");
+		ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q2 : (q0.d1 >> 14) != 0");
+		ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q2 : (q1.d1 >> 14) != 0");
 
 		/* Convert q to floating form: */
 	#ifdef USE_SSE2
@@ -1592,7 +1592,7 @@ if(~pshift != p+78) {
 	  #elif OS_BITS == 32
 
 		#error 32-bit OSes no longer supported for SIMD builds!
-		ASSERT(HERE, (uint32)(~pshift) == 0, "p+78 must be 32-bit here for 32-bit ASM support!");
+		ASSERT((uint32)(~pshift) == 0, "p+78 must be 32-bit here for 32-bit ASM support!");
 
 	  #else	// The 64-bit version of the macro is timing-suboptimal because I used it as a testbed:
 				// This 2-TF-input/4-xxm-register version serves as the basis for an 8-input version
@@ -1976,7 +1976,7 @@ if(~pshift != p+78) {
 	  #endif
 		int fidx,gidx,hidx,iidx;
 		if(init_sse2) {
-			ASSERT(HERE, init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!");
+			ASSERT(init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!");
 			return 0;	// In non-SIMD mode, ||-init call is a no-op
 		}
 
@@ -2021,18 +2021,18 @@ if(~pshift != p+78) {
 			#endif
 				fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 			#ifndef COMPILER_TYPE_GCC
-				ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+				ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 			#endif
-				ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-				ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+				ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+				ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			}
 			if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 				free((void *)sc_arr);	sc_arr=0x0;
 			}
 			// Alloc the local-memory block:
-			sc_arr = ALLOC_DOUBLE(sc_arr, 0x50*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			sc_arr = ALLOC_DOUBLE(sc_arr, 0x50*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);	// Force vec_dbl-alignment
-			ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		#ifdef MULTITHREAD
 			__r0  = sc_ptr;
 			two13i = sc_ptr + 0x40;
@@ -2095,7 +2095,7 @@ if(~pshift != p+78) {
 
 		/* If multithreaded, set the local-store pointers needed for the current thread; */
 	  #ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		sc_ptr = __r0 + thr_id*0x50;
 		/* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 2 to span an SSE register: */
 		fq0    = sc_ptr + 0x00;		gq0    = sc_ptr + 0x01;		hq0    = sc_ptr + 0x02;		iq0    = sc_ptr + 0x03;
@@ -2119,12 +2119,12 @@ if(~pshift != p+78) {
 		two26i = sc_ptr + 0x44;
 		sse2_rnd=sc_ptr + 0x46;
 		half   = sc_ptr + 0x48;
-		tmp = (vec_dbl*)sse2_rnd; ASSERT(HERE,(tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!");
+		tmp = (vec_dbl*)sse2_rnd; ASSERT((tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!");
 	  #endif
 
 	#endif
 
-		ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!");
 		q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p;
 	#ifdef MUL_LOHI64_SUBROUTINE
 		// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -2142,10 +2142,10 @@ if(~pshift != p+78) {
 		q1.d0 += 1;
 		q2.d0 += 1;
 		q3.d0 += 1;
-		ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0");
-		ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0");
-		ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0");
-		ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0");
+		ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0");
+		ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0");
+		ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0");
+		ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0");
 
 		q32_0 = (uint32)q0.d0;
 		q32_1 = (uint32)q1.d0;
@@ -3022,7 +3022,7 @@ if(~pshift != p+78) {
 		double iq0,iq1,iq2, iqinv0,iqinv1,iqinv2, ix0,ix1,ix2, ilo0,ilo1,ilo2, ihi0,ihi1,ihi2;
 		uint32 FERMAT = isPow2_64(p)<<1;	// *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case
 
-		ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!");
 		q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p;
 	#ifdef MUL_LOHI64_SUBROUTINE
 		// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -3040,10 +3040,10 @@ if(~pshift != p+78) {
 		q1.d0 += 1;
 		q2.d0 += 1;
 		q3.d0 += 1;
-		ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0");
-		ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0");
-		ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0");
-		ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0");
+		ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0");
+		ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0");
+		ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0");
+		ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0");
 
 		/* Convert q to floating form: */
 		CVT_UINT78_3WORD_DOUBLE(q0, fq0,fq1,fq2);
@@ -3319,7 +3319,7 @@ if(~pshift != p+78) {
 		// No TF support on ARMv8
 		uint64 twopmodq78_3WORD_DOUBLE_q8(uint64 p, uint64 k[], int init_sse2, int thr_id)
 		{
-			ASSERT(HERE,0,"No TF support on ARMv8!");
+			ASSERT(0,"No TF support on ARMv8!");
 		}
 
 	#elif defined(X64_ASM) && defined(USE_SSE2)
@@ -3409,18 +3409,18 @@ if(~pshift != p+78) {
 				#endif
 					fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 				#ifndef COMPILER_TYPE_GCC
-					ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+					ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 				#endif
-					ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-					ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+					ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+					ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 				}
 				if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 					free((void *)sc_arr);	sc_arr=0x0;
 				}
 				// Alloc the local-memory block - SSE2 needs 6 fewer double-slots than AVX (since only need one copy each of two13i,two26f,two26i), but use same AVX-alloc for both:
-				sc_arr = ALLOC_DOUBLE(sc_arr, 0x6c*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+				sc_arr = ALLOC_DOUBLE(sc_arr, 0x6c*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 				sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);	// Force vec_dbl-alignment
-				ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			#ifdef MULTITHREAD
 				__r0  = sc_ptr;
 			  #ifdef USE_AVX
@@ -3487,7 +3487,7 @@ if(~pshift != p+78) {
 
 			/* If multithreaded, set the local-store pointers needed for the current thread; */
 		#ifdef MULTITHREAD
-			ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+			ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 			sc_ptr = __r0 + thr_id*0x6c;
 			/* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 4 to span an AVX register: */
 			aq0    = sc_ptr + 0x00;	bq0    = sc_ptr + 0x01;	cq0    = sc_ptr + 0x02;	dq0    = sc_ptr + 0x03;	eq0    = sc_ptr + 0x04;	fq0    = sc_ptr + 0x05;	gq0    = sc_ptr + 0x06;	hq0    = sc_ptr + 0x07;
@@ -3512,7 +3512,7 @@ if(~pshift != p+78) {
 		  #endif
 		#endif
 
-			ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q8 : p must be < 2^63!");
+			ASSERT((p >> 63) == 0, "twopmodq78_q8 : p must be < 2^63!");
 			q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p;
 		#ifdef MUL_LOHI64_SUBROUTINE
 			#error MUL_LOHI64_SUBROUTINE defined!
@@ -3534,14 +3534,14 @@ if(~pshift != p+78) {
 			q5.d0 += 1;
 			q6.d0 += 1;
 			q7.d0 += 1;
-			ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0");
-			ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0");
-			ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0");
-			ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0");
-			ASSERT(HERE, (q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0");
-			ASSERT(HERE, (q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0");
-			ASSERT(HERE, (q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0");
-			ASSERT(HERE, (q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0");
+			ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0");
+			ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0");
+			ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0");
+			ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0");
+			ASSERT((q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0");
+			ASSERT((q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0");
+			ASSERT((q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0");
+			ASSERT((q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0");
 
 			q32_0 = (uint32)q0.d0;
 			q32_1 = (uint32)q1.d0;
@@ -3995,9 +3995,9 @@ if(~pshift != p+78) {
 				zshift0 <<= 1;			zshift1 <<= 1;			/* In [0,76]/[36,112]; Doubling the shift count here takes cares of the first SQR_LOHI */
 				pshift = ~pshift;
 				/* 40 16-byte slots for floats, 16 for ints: */
-				sc_arr = ALLOC_VEC_DBL(sc_arr, 40+32);	ASSERT(HERE, sc_arr != 0x0, "ERROR: unable to allocate sc_arr!");
+				sc_arr = ALLOC_VEC_DBL(sc_arr, 40+32);	ASSERT(sc_arr != 0x0, "ERROR: unable to allocate sc_arr!");
 				sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);
-				ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 				/* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 2 to span an SSE register.
 				The bytewise address offsets of the left-column pointers (relative to base address fq0) are in the right comment-column: */
 																															/* Byte offset */
@@ -4036,7 +4036,7 @@ if(~pshift != p+78) {
 
 				/* Need both float and integer data to share same allocated chunk of memory, so can use a single base/offset scheme to manage both */
 				sm_ptr = (uint64*)(sc_ptr + 0x50);	/* Contiguous offset w.r.to last float data above is 0x4a, but start ints at +0x50 for ease: */
-				ASSERT(HERE, (uint32)sm_ptr == ((uint32)sc_ptr +  0x280), "sm_ptr not offset as expected!");
+				ASSERT((uint32)sm_ptr == ((uint32)sc_ptr +  0x280), "sm_ptr not offset as expected!");
 				/* Remember, these are pointers-to-uint128, so need an increment of 2 to span a memory slot: */											/* Byte offsets: */
 				qptr4  = (uint96*)(sm_ptr + 0x00);	qptr5  = (uint96*)(sm_ptr + 0x02);	qptr6  = (uint96*)(sm_ptr + 0x04);	qptr7  = (uint96*)(sm_ptr + 0x06);	/* 0x280 */
 				qinv4  = (uint96*)(sm_ptr + 0x08);	qinv5  = (uint96*)(sm_ptr + 0x0a);	qinv6  = (uint96*)(sm_ptr + 0x0c);	qinv7  = (uint96*)(sm_ptr + 0x0e);	/* 0x2c0 */
@@ -4048,7 +4048,7 @@ if(~pshift != p+78) {
 				ptr64 = (uint64*)ONE96_PTR;	*ptr64++ = ONE96.d0;	*ptr64-- = ONE96.d1;
 			}	/* first_entry */
 
-			ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+			ASSERT((p >> 63) == 0, "p must be < 2^63!");
 			q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p;
 			MUL_LOHI64(q0.d0, k[0], q0.d0, q0.d1);
 			MUL_LOHI64(q1.d0, k[1], q1.d0, q1.d1);
@@ -4067,14 +4067,14 @@ if(~pshift != p+78) {
 			q5.d0 += 1;
 			q6.d0 += 1;
 			q7.d0 += 1;
-			ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0");
-			ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0");
-			ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0");
-			ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0");
-			ASSERT(HERE, (q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0");
-			ASSERT(HERE, (q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0");
-			ASSERT(HERE, (q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0");
-			ASSERT(HERE, (q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0");
+			ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0");
+			ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0");
+			ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0");
+			ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0");
+			ASSERT((q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0");
+			ASSERT((q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0");
+			ASSERT((q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0");
+			ASSERT((q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0");
 
 			/*****************************************************************************************************/
 			/*** From here onward, q0-3 get processed via 78-bit float-based modmul, q4-7 via 96-bit pure-int: ***/
@@ -4866,7 +4866,7 @@ if(~pshift != p+78) {
 					*two13i, *two26f,*two26i, *two52f,*two52i;
 
 		#if FAC_DEBUG
-			if(dbg) printf("%s with p = %llu, k[] = %llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n",
+			if(dbg) printf("%s with p = %" PRIu64 ", k[] = %" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",
 						func,p,k[0x0],k[0x1],k[0x2],k[0x3],k[0x4],k[0x5],k[0x6],k[0x7],k[0x8],k[0x9],k[0xa],k[0xb],k[0xc],k[0xd],k[0xe],k[0xf]);
 		#endif
 			if(p != psave)
@@ -4890,7 +4890,7 @@ if(~pshift != p+78) {
 								// Result in [0,76], i.e. qinv << (zshift<<1) always has at least the leading bit set.
 				pshift = ~pshift;
 			  #if FAC_DEBUG
-				if(dbg) printf("pshift = 0x%llX\n",pshift);
+				if(dbg) printf("pshift = %#" PRIX64 "\n",pshift);
 			  #endif
 			}
 
@@ -4909,18 +4909,18 @@ if(~pshift != p+78) {
 				#endif
 					fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 				#ifndef COMPILER_TYPE_GCC
-					ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+					ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 				#endif
-					ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-					ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+					ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+					ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 				}
 				if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 					free((void *)sc_arr);	sc_arr=0x0;
 				}
 				// Alloc the local-memory block:
-				sc_arr = ALLOC_DOUBLE(sc_arr, 0xfc*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+				sc_arr = ALLOC_DOUBLE(sc_arr, 0xfc*max_threads + 4);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 				sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);	// Force vec_dbl-alignment
-				ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			#ifdef MULTITHREAD
 				__r0  = sc_ptr;
 				two13i = sc_ptr + 0xc0;
@@ -4988,7 +4988,7 @@ if(~pshift != p+78) {
 
 			/* If multithreaded, set the local-store pointers needed for the current thread; */
 		#ifdef MULTITHREAD
-			ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+			ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 			sc_ptr = __r0 + thr_id*0xfc;
 			/* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 4 to span an AVX register: */
 			fq0   [0] = sc_ptr + 0x00;
@@ -5029,7 +5029,7 @@ if(~pshift != p+78) {
 		#ifdef MUL_LOHI64_SUBROUTINE
 			#error MUL_LOHI64_SUBROUTINE defined!
 		#endif
-			ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q16: p must be < 2^63!");
+			ASSERT((p >> 63) == 0, "twopmodq78_q16: p must be < 2^63!");
 			for(j = 0; j < 16; j++)
 			{
 				q[j].d0 = p+p;
@@ -5192,7 +5192,7 @@ if(~pshift != p+78) {
 					: "cc","memory","rax","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6", "xmm8","xmm9","xmm10", "xmm12","xmm13","xmm14"	/* Clobbered registers */\
 				);
 			} else {
-				ASSERT(HERE,0,"zshift out of range!");
+				ASSERT(0,"zshift out of range!");
 			}
 		//	VEC_DBL_INIT_4((vec_dbl*)fx0[0], dtmp);	This wound up using zmm0 in some of my builds, so moved this init inside the iter1 macro below
 
@@ -5237,7 +5237,7 @@ if(~pshift != p+78) {
 
 		#if FAC_DEBUG
 		  if(dbg) {
-			printf("p = %llu, k0 = %llu, start_index0 = %u, initial shift = %u\n",p,k[0],start_index,zshift);
+			printf("p = %" PRIu64 ", k0 = %" PRIu64 ", start_index0 = %u, initial shift = %u\n",p,k[0],start_index,zshift);
 			printf("On modpow-loop entry: start_index = %u,\n\tfx0-2 = %20.15f, %20.15f, %20.15f, %20.15f\n",start_index, *fx0[0],*fx1[0],*fx2[0]);
 		  }
 		#endif
@@ -5359,7 +5359,7 @@ if(~pshift != p+78) {
 
 		#if FAC_DEBUG
 			if(dbg) {
-				printf("xout_q16 = %llX\n",r);
+				printf("xout_q16 = %" PRIX64 "\n",r);
 				exit(0);
 			}
 		#endif
@@ -5386,7 +5386,7 @@ if(~pshift != p+78) {
 			static uint64 psave = 0, pshift;
 			static uint32 start_index, zshift, first_entry = TRUE;
 			uint32 FERMAT = isPow2_64(p)<<1;	// *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case
-			uint8* minv8_ptr = minv8;	// Ptr to Table of precomputed byte-inverses def'd in mi64.h
+			const uint8* minv8_ptr = minv8;	// Ptr to Table of precomputed byte-inverses def'd in mi64.h
 			static int max_threads = 1;	// Default local-array-init is for just a single thread ... caller can re-init for > 1 threads later, if desired.
 		#ifdef USE_AVX512_I
 		  #error AVX-512 IFMA instruction extensions not yet supported!
@@ -5402,7 +5402,7 @@ if(~pshift != p+78) {
 			uint64 *fq0[32],*fq1[32],*fq2[32],*fqhi52[32], *fqinv0[32],*fqinv1[32],*fqinv2[32], *fx0[32],*fx1[32],*fx2[32],
 					*mask_lo26,*mask_lo52;
 			for(j = 0; j < 32; j++) {
-				ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!");
+				ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!");
 			}
 		#else
 			static double *sc_arr = 0x0, *sc_ptr;
@@ -5418,7 +5418,7 @@ if(~pshift != p+78) {
 					kdbl[32];
 			// AVX-512 Foundation lacks the needed DQ extensions, so use HLL to convert kvec entries to double:
 			for(j = 0; j < 32; j++) {
-				ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!");
+				ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!");
 				kdbl[j] = (double)k[j];
 			}
 		#endif
@@ -5458,10 +5458,10 @@ if(~pshift != p+78) {
 				#endif
 					fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 				#ifndef COMPILER_TYPE_GCC
-					ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+					ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 				#endif
-					ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-					ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+					ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+					ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 				}
 				if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 					free((void *)sc_arr);	sc_arr=0x0;
@@ -5469,9 +5469,9 @@ if(~pshift != p+78) {
 				// Alloc the local-memory block the #bytes multiplier has plenty of extra room built in, e.g. for debug-data-writes:
 			#ifdef USE_AVX512_I
 
-				sc_arr = ALLOC_UINT64(sc_arr, 0x1c0*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+				sc_arr = ALLOC_UINT64(sc_arr, 0x1c0*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 				sc_ptr = (uint64 *)ALIGN_VEC_U64(sc_arr);	// Force vec_u64-alignment
-				ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			  #ifdef MULTITHREAD
 				__r0  = sc_ptr;
 				mask_lo26 = sc_ptr + 0x180;
@@ -5522,9 +5522,9 @@ if(~pshift != p+78) {
 			#else	// Default AVX-512 floating-point-FMA mode
 			/***************************************************/
 
-				sc_arr = ALLOC_DOUBLE(sc_arr, 0x1c0*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+				sc_arr = ALLOC_DOUBLE(sc_arr, 0x1c0*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 				sc_ptr = (uint64 *)ALIGN_VEC_DBL(sc_arr);	// Force vec_u64-alignment
-				ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			  #ifdef MULTITHREAD
 				__r0  = sc_ptr;
 				two13i = sc_ptr + 0x180;
@@ -5593,7 +5593,7 @@ if(~pshift != p+78) {
 			/* If multithreaded, set the local-store pointers needed for the current thread; */
 		#ifdef MULTITHREAD
 
-			ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+			ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 			sc_ptr = __r0 + thr_id*0x1c0;
 
 		  #ifdef USE_AVX512_I
@@ -5669,7 +5669,7 @@ if(~pshift != p+78) {
 		#ifdef MUL_LOHI64_SUBROUTINE
 			#error MUL_LOHI64_SUBROUTINE defined!
 		#endif
-			ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q32: p must be < 2^63!");
+			ASSERT((p >> 63) == 0, "twopmodq78_q32: p must be < 2^63!");
 
 		#ifdef USE_AVX512_I
 
@@ -5987,7 +5987,7 @@ if(~pshift != p+78) {
 			} else if(zshift < 78) {
 				dtmp = 1<<(zshift-52);	for(j = 0; j < 32; j += 8) { VEC_DBL_INIT_8((vec_dbl*)fx0[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx1[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx2[j],dtmp); }
 			} else {
-				ASSERT(HERE,0,"zshift out of range!");
+				ASSERT(0,"zshift out of range!");
 			}
 
 			/*...x^2 mod q is returned in x. */
@@ -6133,7 +6133,7 @@ if(~pshift != p+78) {
 			static uint64 psave = 0, pshift;
 			static uint32 start_index, zshift, first_entry = TRUE;
 			uint32 FERMAT = isPow2_64(p)<<1;	// *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case
-			uint8* minv8_ptr = minv8;	// Ptr to Table of precomputed byte-inverses def'd in mi64.h
+			const uint8* minv8_ptr = minv8;	// Ptr to Table of precomputed byte-inverses def'd in mi64.h
 			static int max_threads = 1;	// Default local-array-init is for just a single thread ... caller can re-init for > 1 threads later, if desired.
 		#ifdef USE_AVX512_I
 		  #error AVX-512 IFMA instruction extensions not yet supported!
@@ -6149,7 +6149,7 @@ if(~pshift != p+78) {
 			uint64 *fq0[64],*fq1[64],*fq2[64],*fqhi52[64], *fqinv0[64],*fqinv1[64],*fqinv2[64], *fx0[64],*fx1[64],*fx2[64],
 					*mask_lo26,*mask_lo52;
 			for(j = 0; j < 64; j++) {
-				ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!");
+				ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!");
 			}
 		#else
 			static double *sc_arr = 0x0, *sc_ptr;
@@ -6165,7 +6165,7 @@ if(~pshift != p+78) {
 					kdbl[64];
 			// AVX-512 Foundation lacks the needed DQ extensions, so use HLL to convert kvec entries to double:
 			for(j = 0; j < 64; j++) {
-				ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!");
+				ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!");
 				kdbl[j] = (double)k[j];
 			}
 		#endif
@@ -6205,10 +6205,10 @@ if(~pshift != p+78) {
 				#endif
 					fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads);
 				#ifndef COMPILER_TYPE_GCC
-					ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+					ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 				#endif
-					ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-					ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+					ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+					ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 				}
 				if(sc_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 					free((void *)sc_arr);	sc_arr=0x0;
@@ -6216,9 +6216,9 @@ if(~pshift != p+78) {
 				// Alloc the local-memory block the #bytes multiplier has plenty of extra room built in, e.g. for debug-data-writes:
 			#ifdef USE_AVX512_I
 
-				sc_arr = ALLOC_UINT64(sc_arr, 0x380*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+				sc_arr = ALLOC_UINT64(sc_arr, 0x380*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 				sc_ptr = (uint64 *)ALIGN_VEC_U64(sc_arr);	// Force vec_u64-alignment
-				ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			  #ifdef MULTITHREAD
 				__r0  = sc_ptr;
 				mask_lo26 = sc_ptr + 0x300;
@@ -6269,9 +6269,9 @@ if(~pshift != p+78) {
 			#else	// Default AVX-512 floating-point-FMA mode
 			/***************************************************/
 
-				sc_arr = ALLOC_DOUBLE(sc_arr, 0x380*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+				sc_arr = ALLOC_DOUBLE(sc_arr, 0x380*max_threads);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 				sc_ptr = (uint64 *)ALIGN_VEC_DBL(sc_arr);	// Force vec_u64-alignment
-				ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+				ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			  #ifdef MULTITHREAD
 				__r0  = sc_ptr;
 				two13i = sc_ptr + 0x300;
@@ -6340,7 +6340,7 @@ if(~pshift != p+78) {
 			/* If multithreaded, set the local-store pointers needed for the current thread; */
 		#ifdef MULTITHREAD
 
-			ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+			ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 			sc_ptr = __r0 + thr_id*0x380;
 
 		  #ifdef USE_AVX512_I
@@ -6416,7 +6416,7 @@ if(~pshift != p+78) {
 		#ifdef MUL_LOHI64_SUBROUTINE
 			#error MUL_LOHI64_SUBROUTINE defined!
 		#endif
-			ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q64: p must be < 2^63!");
+			ASSERT((p >> 63) == 0, "twopmodq78_q64: p must be < 2^63!");
 
 		#ifdef USE_AVX512_I
 
@@ -6777,7 +6777,7 @@ if(~pshift != p+78) {
 			} else if(zshift < 78) {
 				dtmp = 1<<(zshift-52);	for(j = 0; j < 64; j += 8) { VEC_DBL_INIT_8((vec_dbl*)fx0[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx1[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx2[j],dtmp); }
 			} else {
-				ASSERT(HERE,0,"zshift out of range!");
+				ASSERT(0,"zshift out of range!");
 			}
 
 			/*...x^2 mod q is returned in x. */
diff --git a/src/twopmodq80.h b/src/twopmodq80.h
index 38f83019..f14b5de1 100755
--- a/src/twopmodq80.h
+++ b/src/twopmodq80.h
@@ -1866,7 +1866,7 @@ to balanced-digit floating-point form. Outputs have the following size ranges:
 	uint64 __tmp64;\
 	int64 __itmp, __cy;\
 	\
-	DBG_ASSERT(HERE, (__x.d1 >> 14) == 0, "Input > 78-bit limit!");\
+	DBG_ASSERT((__x.d1 >> 14) == 0, "Input > 78-bit limit!");\
 	\
 	/* Digit 0: */\
 	__tmp64 = __x.d0;\
@@ -1892,7 +1892,7 @@ to balanced-digit floating-point form. Outputs have the following size ranges:
 	/* No balanced-digit normalization of MSW: */\
 	__fword2 = (double)__tmp64;\
 	\
-	DBG_ASSERT(HERE, __fword2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
+	DBG_ASSERT(__fword2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
 }
 
 /* Converts a 78-bit unsigned input __x (stored in balanced-digit
@@ -1907,7 +1907,7 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized.
 	if(__itmp < 0)	/* If current digit < 0, add the base and set carry = -1	*/\
 	{\
 		__itmp += TWO26FLOAT;\
-		DBG_ASSERT(HERE, __itmp >= 0, "Normalized digit still < 0!");\
+		DBG_ASSERT(__itmp >= 0, "Normalized digit still < 0!");\
 		__cy = -1;\
 	}\
 	else\
@@ -1921,7 +1921,7 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized.
 	if(__itmp < 0)\
 	{\
 		__itmp += TWO26FLOAT;\
-		DBG_ASSERT(HERE, __itmp >= 0, "Normalized digit still < 0!");\
+		DBG_ASSERT(__itmp >= 0, "Normalized digit still < 0!");\
 		__cy = -1;\
 	}\
 	else\
@@ -1935,7 +1935,7 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized.
 	if(__itmp < 0)\
 	{\
 		__itmp += TWO26FLOAT;\
-		DBG_ASSERT(HERE, __itmp >= 0, "Normalized digit still < 0!");\
+		DBG_ASSERT(__itmp >= 0, "Normalized digit still < 0!");\
 		__cy = -1;\
 	}\
 	else\
@@ -1945,8 +1945,8 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized.
 	__x.d0 += ((uint64)__itmp << 52);\
 	__x.d1  = ((uint64)__itmp >> 12) & 0x0000000000003fff;	/* Only case where we really need the (uint64) cast */\
 	\
-	DBG_ASSERT(HERE, (__x.d1 >> 14) == 0, "Output > 78-bit limit!");\
-	DBG_ASSERT(HERE,  __cy          == 0, "Nonzero exit carry!");\
+	DBG_ASSERT((__x.d1 >> 14) == 0, "Output > 78-bit limit!");\
+	DBG_ASSERT(__cy          == 0, "Nonzero exit carry!");\
 }
 
 /* Takes a 78-bit unsigned input __x stored in balanced-digit floating-point form
@@ -1968,8 +1968,8 @@ and renormalizes with respect to the balanced-digit base.
 	/* Digit 2: */\
 	__x2 += __fcy;\
 	\
-	DBG_ASSERT(HERE, __x2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __x2 >= 0         , "MSW < 0!");\
+	DBG_ASSERT(__x2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
+	DBG_ASSERT(__x2 >= 0         , "MSW < 0!");\
 }
 
 /* Takes a 156-bit unsigned input __x stored in balanced-digit floating-point form
@@ -2013,10 +2013,10 @@ separately, we require the MSW of each to be nonnegative, i.e. we don't balance
 	/* Digit 5: */\
 	__x5 += __fcy;\
 	\
-	DBG_ASSERT(HERE, __x2 >= 0         , "_x2 < 0!");\
-	DBG_ASSERT(HERE, __x2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __x5 >= 0         , "MSW < 0!");\
-	DBG_ASSERT(HERE, __x5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
+	DBG_ASSERT(__x2 >= 0         , "_x2 < 0!");\
+	DBG_ASSERT(__x2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\
+	DBG_ASSERT(__x5 >= 0         , "MSW < 0!");\
+	DBG_ASSERT(__x5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
 }
 
 /**********************************************************************************/
@@ -2069,9 +2069,9 @@ separately, we require the MSW of each to be nonnegative, i.e. we don't balance
 	double __fcy;\
 	uint32 __itmp;\
 	\
-	DBG_ASSERT(HERE, __fx0 < TWO26FLOAT, "x0 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __fx1 < TWO26FLOAT, "x1 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __fx2 < TWO26FLOAT, "x2 > TWO26FLOAT");\
+	DBG_ASSERT(__fx0 < TWO26FLOAT, "x0 > TWO26FLOAT");\
+	DBG_ASSERT(__fx1 < TWO26FLOAT, "x1 > TWO26FLOAT");\
+	DBG_ASSERT(__fx2 < TWO26FLOAT, "x2 > TWO26FLOAT");\
 	\
 	/* Digit 0: */\
 	__fprod0  =  __fx0*__fx0;\
@@ -2105,10 +2105,10 @@ separately, we require the MSW of each to be nonnegative, i.e. we don't balance
 	/* Digit 5: */\
 	__fprod5  = __fcy;\
 	\
-	DBG_ASSERT(HERE, __fprod2 >= 0         , "_x2 < 0!");\
-	DBG_ASSERT(HERE, __fprod2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __fprod5 >= 0         , "MSW < 0!");\
-	DBG_ASSERT(HERE, __fprod5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
+	DBG_ASSERT(__fprod2 >= 0         , "_x2 < 0!");\
+	DBG_ASSERT(__fprod2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\
+	DBG_ASSERT(__fprod5 >= 0         , "MSW < 0!");\
+	DBG_ASSERT(__fprod5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\
 }
 
 #ifdef __CUDACC__
@@ -2483,7 +2483,7 @@ double __mo52 = fma(__y1,TWO26FLOAT,__y0);	/* mo52 = y0 + y1*2^26 */\
 		__itmp   = (__lo2 < 0);\
 		__lo2   += (double)(__itmp << 26);\
 		/* Require output to be nonnegative, so leave MSW unbalanced: */\
-		DBG_ASSERT(HERE, __lo2 >= 0, "MSW < 0!");\
+		DBG_ASSERT(__lo2 >= 0, "MSW < 0!");\
 	}
 
 #endif
@@ -2564,13 +2564,13 @@ we code so that any or all of __X, __Y and __LO may have the same addresses.
 	double __fcy, __tmp, __prod3, __prod4;\
 	uint32 __itmp;\
 	\
-	DBG_ASSERT(HERE, __x0 < TWO26FLOAT, "x0 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __x1 < TWO26FLOAT, "x1 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __x2 < TWO26FLOAT, "x2 > TWO26FLOAT");\
+	DBG_ASSERT(__x0 < TWO26FLOAT, "x0 > TWO26FLOAT");\
+	DBG_ASSERT(__x1 < TWO26FLOAT, "x1 > TWO26FLOAT");\
+	DBG_ASSERT(__x2 < TWO26FLOAT, "x2 > TWO26FLOAT");\
 	\
-	DBG_ASSERT(HERE, __y0 < TWO26FLOAT, "y0 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __y1 < TWO26FLOAT, "y1 > TWO26FLOAT");\
-	DBG_ASSERT(HERE, __y2 < TWO26FLOAT, "y2 > TWO26FLOAT");\
+	DBG_ASSERT(__y0 < TWO26FLOAT, "y0 > TWO26FLOAT");\
+	DBG_ASSERT(__y1 < TWO26FLOAT, "y1 > TWO26FLOAT");\
+	DBG_ASSERT(__y2 < TWO26FLOAT, "y2 > TWO26FLOAT");\
 	\
 	/* Digit 0: */\
 	__tmp  =  __x0*__y0;\
@@ -2610,7 +2610,7 @@ we code so that any or all of __X, __Y and __LO may have the same addresses.
 	/* Digit 5: */\
 	__hi2    = __fcy;\
 	\
-	DBG_ASSERT(HERE, __hi2 >= 0, "MSW < 0!");\
+	DBG_ASSERT(__hi2 >= 0, "MSW < 0!");\
 }
 
 #ifdef __CUDACC__
@@ -2892,7 +2892,7 @@ out128.d0 = (uint64)__fcy; out128.d1 = (uint64)0ull;\
 ADD128(j128,out128,out128);\
 lo26 = out128.d0 & 0x0000000003FFFFFFull;\
 RSHIFT128(out128, 26, out128);\
-fprintf(stderr,"exact<52:77>, <78:129> = %20llu, %20llu\n",out128.d0,out128.d1);\
+fprintf(stderr,"exact<52:77>, <78:129> = %20" PRIu64 ", %20" PRIu64 "\n",out128.d0,out128.d1);\
 }
 */
 	#define MULH78_3WORD_DOUBLE_q4(\
@@ -2935,7 +2935,7 @@ out128.d0 = (uint64)__fcy; out128.d1 = (uint64)0ull;\
 ADD128(j128,out128,out128);\
 lo26 = out128.d0 & 0x0000000003FFFFFFull;\
 RSHIFT128(out128, 26, out128);\
-fprintf(stderr,"exact<52:77>, <78:129> = %20llu, %20llu\n",out128.d0,out128.d1);\
+fprintf(stderr,"exact<52:77>, <78:129> = %20" PRIu64 ", %20" PRIu64 "\n",out128.d0,out128.d1);\
 		__ftmp = __fx52*__fy2+__fx2*__fy52+__fcy;\
 		__gtmp = __gx52*__gy2+__gx2*__gy52+__gcy;\
 		__htmp = __hx52*__hy2+__hx2*__hy52+__hcy;\
diff --git a/src/twopmodq96.c b/src/twopmodq96.c
index 7c1a62d4..19bb15b8 100755
--- a/src/twopmodq96.c
+++ b/src/twopmodq96.c
@@ -118,7 +118,7 @@
 		/*...Double and return.	These are specialized for the case
 		where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2.
 		*/
-//(p == 18276023 && k == 760542841672ull)printf("q = [%u,%llu], x = [%u,%llu]\n",q.d1,q.d0, x.d1,x.d0);
+//(p == 18276023 && k == 760542841672ull)printf("q = [%u,%" PRIu64 "], x = [%u,%" PRIu64 "]\n",q.d1,q.d0, x.d1,x.d0);
 	  #if 1				// I should read my own comments ... since x = (q+1)/2 implies divisibility can replace this...
 		ADD96(x,x,x);	/* In the case of interest, x = (q+1)/2 < 2^95, so x + x cannot overflow. */
 		q.d0 -= FERMAT;
@@ -129,7 +129,7 @@
 		return (x.d1 == qhalf.d1 && x.d0 == (qhalf.d0+1));
 	  #endif
 	#else	// ifndef __CUDA_ARCH__
-		ASSERT(HERE, 0, "Device code being called in host mode!");
+		ASSERT(0, "Device code being called in host mode!");
 		return 0;
 	#endif
 	}
@@ -229,7 +229,7 @@ uint96 twopmodq96(uint64 p, uint64 k)
 #ifdef FAC_DEBUG
 if(dbg)printf("twopmodq96:\n");
 #endif
-	ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+	ASSERT((p >> 63) == 0, "p must be < 2^63!");
 	q.d0 = p+p;
 #ifdef MUL_LOHI64_SUBROUTINE
 	// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -246,7 +246,7 @@ if(dbg)printf("twopmodq96:\n");
 	*/
 	/* q must be odd for Montgomery-style modmul to work: */
 #ifdef FAC_DEBUG
-	ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq96 : (q.d0 & (uint64)1) == 1");
+	ASSERT((q.d0 & (uint64)1) == 1, "twopmodq96 : (q.d0 & (uint64)1) == 1");
 #endif
 	/* Init qinv = q. We're really only interested in the bottom 2 bits of q. */
 	qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2;	qinv.d1 = (uint64)0;
@@ -287,7 +287,7 @@ if(dbg)printf("twopmodq96:\n");
 	MULL96(q, qinv, x);			MULL128(q, qinv, y);
 	SUB96 (TWO96, x, x);		SUB128 (TWO128, y, y);
 	MULL96(qinv, x, x);			MULL128(qinv, y, y);
-	ASSERT(HERE, x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0, "x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0");
+	ASSERT(x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0, "x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0");
 #endif
 	/* qinv has 96 bits, but only the upper 32 get modified here. */
 #ifdef MUL_LOHI64_SUBROUTINE
@@ -299,7 +299,7 @@ if(dbg)printf("twopmodq96:\n");
 	qinv.d1 &= 0x00000000ffffffff;	/* Only want the lower 32 bits here */
 
 #ifdef FAC_DEBUG
-	ASSERT(HERE, qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq96 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
+	ASSERT(qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq96 : qinv.d1 == x.d1 && qinv.d0 == x.d0");
 	if(dbg) printf("q    = %s\n", &char_buf[convert_uint96_base10_char(char_buf, q   )]);
 	if(dbg) printf("qinv = %s\n", &char_buf[convert_uint96_base10_char(char_buf, qinv)]);
 #endif
@@ -333,7 +333,7 @@ if(dbg)printf("twopmodq96:\n");
 	if((pshift >> j) & (uint64)1)
 	{
 	#ifdef FAC_DEBUG
-		ASSERT(HERE, CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)");
+		ASSERT(CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)");
 	#endif
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(CMPUGT96(x, qhalf)){ ADD96(x, x, x); SUB96(x, q, x); }else{ ADD96(x, x, x); }
@@ -369,15 +369,15 @@ if(dbg)printf("twopmodq96:\n");
 			SUB96(q, lo, lo);
 			ADD96(lo, hi, x);
 #ifdef FAC_DEBUG
-	if(dbg) printf("q-l   = %10u, %20llu\n", lo.d1, lo.d0);
-	if(dbg) printf("q-l+h = %10u, %20llu\n",  x.d1,  x.d0);
+	if(dbg) printf("q-l   = %10u, %20" PRIu64 "\n", lo.d1, lo.d0);
+	if(dbg) printf("q-l+h = %10u, %20" PRIu64 "\n",  x.d1,  x.d0);
 #endif
 		}
 		else
 		{
 			SUB96(hi, lo, x);
 #ifdef FAC_DEBUG
-	if(dbg) printf("q=h-l = %10u, %20llu\n",  x.d1,  x.d0);
+	if(dbg) printf("q=h-l = %10u, %20" PRIu64 "\n",  x.d1,  x.d0);
 #endif
 		}
 
@@ -388,7 +388,7 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint96_base10_char(char_buf
 		if((pshift >> j) & (uint64)1)
 		{
 		#ifdef FAC_DEBUG
-			ASSERT(HERE, CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)");
+			ASSERT(CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT96(x, qhalf)){ ADD96(x, x, x); SUB96(x, q, x); }else{ ADD96(x, x, x); }
@@ -479,24 +479,24 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 			#endif
 				fprintf(stderr, "twopmodq96_q4: Setting up for as many as %d threads...\n",max_threads);
 			#ifndef COMPILER_TYPE_GCC
-				ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+				ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 			#endif
-				ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-				ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+				ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+				ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			}
 			if(sm_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 				free((void *)sm_arr);	sm_arr=0x0;
 			}
 			// Alloc the local-memory block:
-			sm_arr = ALLOC_UINT64(sm_arr, 0x32*max_threads);	ASSERT(HERE, sm_arr != 0x0, "ERROR: unable to allocate sm_arr!");
-			sm_ptr = (uint64*)ALIGN_UINT64(sm_arr);	ASSERT(HERE, ((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!");
+			sm_arr = ALLOC_UINT64(sm_arr, 0x32*max_threads);	ASSERT(sm_arr != 0x0, "ERROR: unable to allocate sm_arr!");
+			sm_ptr = (uint64*)ALIGN_UINT64(sm_arr);	ASSERT(((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!");
 		#ifdef MULTITHREAD
 			__r0  = (uint96 *)sm_ptr;
 			ptr64 = sm_ptr + 0x30;	// *** PTR-OFFSET IN TERMS OF UINT64 HERE ***
 			for(j = 0; j < max_threads; ++j) {
 				// These data fixed within each thread's local store:
 				*ptr64++ = ONE96.d0;	*ptr64-- = ONE96.d1;
-			//	printf("INIT: Thr %d ONE96_PTR address = %llX; data.d0,d1 = %llu,%u\n",thr_id,(uint64)ptr64,((uint96 *)ptr64)->d0,((uint96 *)ptr64)->d1);
+			//	printf("INIT: Thr %d ONE96_PTR address = %" PRIX64 "; data.d0,d1 = %" PRIu64 ",%u\n",thr_id,(uint64)ptr64,((uint96 *)ptr64)->d0,((uint96 *)ptr64)->d1);
 				ptr64 += 0x32;	// Move on to next thread's local store
 			}
 		#else
@@ -515,7 +515,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		ptr64 = ((uint64*)__r0) + thr_id*0x32;
 		qptr0  = (uint96*)(ptr64 + 0x00);	qptr1  = (uint96*)(ptr64 + 0x02);	qptr2  = (uint96*)(ptr64 + 0x04);	qptr3  = (uint96*)(ptr64 + 0x06);
 		qinv0  = (uint96*)(ptr64 + 0x08);	qinv1  = (uint96*)(ptr64 + 0x0a);	qinv2  = (uint96*)(ptr64 + 0x0c);	qinv3  = (uint96*)(ptr64 + 0x0e);
@@ -524,8 +524,8 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 		qhalf0 = (uint96*)(ptr64 + 0x20);	qhalf1 = (uint96*)(ptr64 + 0x22);	qhalf2 = (uint96*)(ptr64 + 0x24);	qhalf3 = (uint96*)(ptr64 + 0x26);
 		hi0    = (uint96*)(ptr64 + 0x28);	hi1    = (uint96*)(ptr64 + 0x2a);	hi2    = (uint96*)(ptr64 + 0x2c);	hi3    = (uint96*)(ptr64 + 0x2e);
 		ONE96_PTR = (uint96*)(ptr64 + 0x30);
-	//	printf("Thr %d ONE96_PTR address = %llX; data.d0,d1 = %llu,%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1);
-		ASSERT(HERE,(ONE96_PTR->d0 == ONE96.d0) && (ONE96_PTR->d1 == ONE96.d1), "Bad data at ONE96_PTR address!");
+	//	printf("Thr %d ONE96_PTR address = %" PRIX64 "; data.d0,d1 = %" PRIu64 ",%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1);
+		ASSERT((ONE96_PTR->d0 == ONE96.d0) && (ONE96_PTR->d1 == ONE96.d1), "Bad data at ONE96_PTR address!");
 	#endif
 
 		pshift = p + 96;
@@ -544,7 +544,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 		if(dbg)	printf("twopmodq96_q4: leadb = %u\n",leadb);
 	#endif
 
-		ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "p must be < 2^63!");
 		q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p;
 		MUL_LOHI64(q0.d0, k0, q0.d0, q0.d1);
 		MUL_LOHI64(q1.d0, k1, q1.d0, q1.d1);
@@ -570,7 +570,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 	!    Find modular inverse (mod 2^96) of q in preparation for modular multiply.
 	*/
 		/* q must be odd for Montgomery-style modmul to work: */
-		ASSERT(HERE, (q0.d0 & 1) && (q1.d0 & 1) && (q2.d0 & 1) && (q3.d0 & 1), "even modulus!");
+		ASSERT((q0.d0 & 1) && (q1.d0 & 1) && (q2.d0 & 1) && (q3.d0 & 1), "even modulus!");
 		qinv0->d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2;	qinv0->d1 = (uint64)0;
 		qinv1->d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2;	qinv1->d1 = (uint64)0;
 		qinv2->d0 = (q2.d0 + q2.d0 + q2.d0) ^ (uint64)2;	qinv2->d1 = (uint64)0;
@@ -666,7 +666,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 			if(CMPUGT96_PTR(x2, qhalf2)){ ADD96_PTR(x2, x2, x2); SUB96_PTR(x2, qptr2, x2); }else{ ADD96_PTR(x2, x2, x2); }
 			if(CMPUGT96_PTR(x3, qhalf3)){ ADD96_PTR(x3, x3, x3); SUB96_PTR(x3, qptr3, x3); }else{ ADD96_PTR(x3, x3, x3); }
 		#ifdef FAC_DEBUG
-			ASSERT(HERE, CMPULT96_PTR(x0, qptr0), "twopmodq96_q4 : CMPULT96(x0,q0)");
+			ASSERT(CMPULT96_PTR(x0, qptr0), "twopmodq96_q4 : CMPULT96(x0,q0)");
 		#endif
 		}
 
@@ -1107,10 +1107,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 		zshift <<= 1;				/* Doubling the shift count here takes cares of the first SQR_LOHI */
 		pshift = ~pshift;
 	#ifdef FAC_DEBUG
-		if(dbg)	printf("twopmodq96_q4: leadb = %u, pshift = %llu\n",leadb,pshift);
+		if(dbg)	printf("twopmodq96_q4: leadb = %u, pshift = %" PRIu64 "\n",leadb,pshift);
 	#endif
 
-		ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "p must be < 2^63!");
 		q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p;
 	#ifdef MUL_LOHI64_SUBROUTINE
 		// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -1155,7 +1155,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 		*/
 		/* q must be odd for Montgomery-style modmul to work: */
 	#ifdef FAC_DEBUG
-		ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq96_q4 : (q0.d0 & (uint64)1) == 1");
+		ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq96_q4 : (q0.d0 & (uint64)1) == 1");
 	#endif
 		qinv0.d0  = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2;	qinv0.d1  = (uint64)0;
 		qinv1.d0  = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2;	qinv1.d1  = (uint64)0;
@@ -1274,10 +1274,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 			if(CMPUGT96(x3, qhalf3)){ ADD96(x3, x3, x3); SUB96(x3, q3, x3); }else{ ADD96(x3, x3, x3); }
 		#ifdef FAC_DEBUG
 		//	if(CMPULT96(q0, x0)) { sprintf(char_buf, "twopmodq96_q4 : (x0 = %s) >= (q0 = %s)", &str0[convert_uint96_base10_char(str0, x0)], &str1[convert_uint96_base10_char(str1, q0)] );	DBG_WARN(HERE, char_buf, STATFILE, !restart); }
-			ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)");
-			ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)");
-			ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)");
-			ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)");
+			ASSERT(CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)");
+			ASSERT(CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)");
+			ASSERT(CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)");
+			ASSERT(CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)");
 			if(dbg) {
 				printf("x0 = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x0)]);
 				printf("x1 = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x1)]);
@@ -1396,10 +1396,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 			if((pshift >> j) & (uint64)1)
 			{
 			#ifdef FAC_DEBUG
-				ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)");
-				ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)");
-				ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)");
-				ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)");
+				ASSERT(CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)");
+				ASSERT(CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)");
+				ASSERT(CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)");
+				ASSERT(CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)");
 			#endif
 				/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 				if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); }
@@ -1458,10 +1458,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 			if((pshift >> j) & (uint64)1)
 			{
 			#ifdef FAC_DEBUG
-				ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)");
-				ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)");
-				ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)");
-				ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)");
+				ASSERT(CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)");
+				ASSERT(CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)");
+				ASSERT(CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)");
+				ASSERT(CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)");
 			#endif
 				if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); }
 				if(CMPUGT96(x1, qhalf1)){ ADD96(x1, x1, x1); SUB96(x1, q1, x1); }else{ ADD96(x1, x1, x1); }
@@ -1582,24 +1582,24 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 			#endif
 				fprintf(stderr, "twopmodq96_q4: Setting up for as many as %d threads...\n",max_threads);
 			#ifndef COMPILER_TYPE_GCC
-				ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
+				ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!");
 			#endif
-				ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
-				ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!");
+				ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!");
+				ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!");
 			}
 			if(sm_arr != 0x0) {	// Have previously-malloc'ed local storage (e.g. unthreaded call to the function)
 				free((void *)sm_arr);	sm_arr=0x0;
 			}
 			// Alloc the local-memory block - use uint64 allooc/align macros here, but underlying data are all uint96 = [uint64,uint32] pairs:
-			sm_arr = (uint96*)ALLOC_UINT64(sm_arr, 0x4a*max_threads);	ASSERT(HERE, sm_arr != 0x0, "ERROR: unable to allocate sm_arr!");
-			sm_ptr = (uint96*)ALIGN_UINT64(sm_arr);	ASSERT(HERE, ((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!");
+			sm_arr = (uint96*)ALLOC_UINT64(sm_arr, 0x4a*max_threads);	ASSERT(sm_arr != 0x0, "ERROR: unable to allocate sm_arr!");
+			sm_ptr = (uint96*)ALIGN_UINT64(sm_arr);	ASSERT(((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!");
 		#ifdef MULTITHREAD
 			__r0  = (uint96 *)sm_ptr;
 			ptr32 = (uint32*)(sm_ptr + 0x30);	// perm_mask ptr to permute-index register containing dwords 0-7 = [0,7,1,7,2,7,3,7]
 			for(j = 0; j < max_threads; ++j) {
 				// These data fixed within each thread's local store:
 				*ptr32 = 0;	*(ptr32+1) = 7;	*(ptr32+1) = 1;	*(ptr32+1) = 7;	*(ptr32+1) = 2;	*(ptr32+1) = 7;	*(ptr32+1) = 3;	*(ptr32+1) = 7;
-			//	printf("INIT: Thr %d perm_mask address = %llX; data.d0-7 = %llu,%u\n",thr_id,(uint64)ptr96,((uint96 *)ptr96)->d0,((uint96 *)ptr96)->d1);
+			//	printf("INIT: Thr %d perm_mask address = %" PRIX64 "; data.d0-7 = %" PRIu64 ",%u\n",thr_id,(uint64)ptr96,((uint96 *)ptr96)->d0,((uint96 *)ptr96)->d1);
 				ptr32 += 3 * 0x4a;	// Move on to next thread's local store; 3x accounts for size differntial between uint32 and uint96
 			}
 		#else
@@ -1628,7 +1628,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 
 	/* If multithreaded, set the local-store pointers needed for the current thread; */
 	#ifdef MULTITHREAD
-		ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
+		ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!");
 		ptr96 = ((uint64*)__r0) + thr_id*0x4a;
 		q0    = ptr96 + 0x00; q1    = ptr96 + 0x01; q2    = ptr96 + 0x02; q3    = ptr96 + 0x03; q4    = ptr96 + 0x04; q5    = ptr96 + 0x05; q6    = ptr96 + 0x06; q7    = ptr96 + 0x07;
 		qinv0 = ptr96 + 0x08; qinv1 = ptr96 + 0x09; qinv2 = ptr96 + 0x0a; qinv3 = ptr96 + 0x0b; qinv4 = ptr96 + 0x0c; qinv5 = ptr96 + 0x0d; qinv6 = ptr96 + 0x0e; qinv7 = ptr96 + 0x0f;
@@ -1637,7 +1637,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 		lo0   = ptr96 + 0x28; lo1   = ptr96 + 0x29; lo2   = ptr96 + 0x2a; lo3   = ptr96 + 0x23; lo4   = ptr96 + 0x2c; lo5   = ptr96 + 0x2d; lo6   = ptr96 + 0x2e; lo7   = ptr96 + 0x2f;
 		hi0   = ptr96 + 0x28; hi1   = ptr96 + 0x29; hi2   = ptr96 + 0x2a; hi3   = ptr96 + 0x2b; hi4   = ptr96 + 0x2c; hi5   = ptr96 + 0x2d; hi6   = ptr96 + 0x2e; hi7   = ptr96 + 0x2f;
 		ptr32 = perm_mask = ptr96 + 0x30;	// (0x30 * 3/2) + 2 gives 0x4a uint64 in above alloc
-		ASSERT(HERE,(*ptr32 == 0) && (*(ptr32+1) == 7) && (*(ptr32+1) == 1) && (*(ptr32+1) == 7) && (*(ptr32+1) == 2) && (*(ptr32+1) == 7) && (*(ptr32+1) == 3) && (*(ptr32+1) == 7), "Bad data at perm_mask address!");
+		ASSERT((*ptr32 == 0) && (*(ptr32+1) == 7) && (*(ptr32+1) == 1) && (*(ptr32+1) == 7) && (*(ptr32+1) == 2) && (*(ptr32+1) == 7) && (*(ptr32+1) == 3) && (*(ptr32+1) == 7), "Bad data at perm_mask address!");
 	#endif
 
 		pshift = p + 96;
@@ -1656,7 +1656,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)])
 		if(dbg)	printf("twopmodq96_q8: leadb = %u\n",leadb);
 	#endif
 
-		ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "p must be < 2^63!");
 		q0->d0 = q1->d0 = q2->d0 = q3->d0 = q4->d0 = q5->d0 = q6->d0 = q7->d0 = p+p;
 		MUL_LOHI64(q0->d0, k0, q0->d0, q0->d1);
 		MUL_LOHI64(q1->d0, k1, q1->d0, q1->d1);
@@ -2107,7 +2107,7 @@ exit(0);
 		if(dbg)printf("twopmodq96_q8:\n");
 	#endif
 
-		ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!");
+		ASSERT((p >> 63) == 0, "p must be < 2^63!");
 		q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p;
 	#ifdef MUL_LOHI64_SUBROUTINE
 		// MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1
@@ -2153,14 +2153,14 @@ exit(0);
 		*/
 		/* q must be odd for Montgomery-style modmul to work: */
 	#ifdef FAC_DEBUG
-		ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q0.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q1.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q1.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q2.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q2.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q3.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q3.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q4.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q4.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q5.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q5.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q6.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q6.d0 & (uint64)1) == 1");
-		ASSERT(HERE, (q7.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q7.d0 & (uint64)1) == 1");
+		ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q0.d0 & (uint64)1) == 1");
+		ASSERT((q1.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q1.d0 & (uint64)1) == 1");
+		ASSERT((q2.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q2.d0 & (uint64)1) == 1");
+		ASSERT((q3.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q3.d0 & (uint64)1) == 1");
+		ASSERT((q4.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q4.d0 & (uint64)1) == 1");
+		ASSERT((q5.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q5.d0 & (uint64)1) == 1");
+		ASSERT((q6.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q6.d0 & (uint64)1) == 1");
+		ASSERT((q7.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q7.d0 & (uint64)1) == 1");
 	#endif
 		qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2;	qinv0.d1 = (uint64)0;
 		qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2;	qinv1.d1 = (uint64)0;
@@ -2276,14 +2276,14 @@ exit(0);
 		if((pshift >> j) & (uint64)1)
 		{
 		#ifdef FAC_DEBUG
-			ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)");
-			ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)");
-			ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)");
-			ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)");
-			ASSERT(HERE, CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)");
-			ASSERT(HERE, CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)");
-			ASSERT(HERE, CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)");
-			ASSERT(HERE, CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)");
+			ASSERT(CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)");
+			ASSERT(CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)");
+			ASSERT(CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)");
+			ASSERT(CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)");
+			ASSERT(CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)");
+			ASSERT(CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)");
+			ASSERT(CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)");
+			ASSERT(CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)");
 		#endif
 			/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 			if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); }
@@ -2353,14 +2353,14 @@ exit(0);
 			if((pshift >> j) & (uint64)1)
 			{
 			#ifdef FAC_DEBUG
-				ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)");
-				ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)");
-				ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)");
-				ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)");
-				ASSERT(HERE, CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)");
-				ASSERT(HERE, CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)");
-				ASSERT(HERE, CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)");
-				ASSERT(HERE, CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)");
+				ASSERT(CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)");
+				ASSERT(CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)");
+				ASSERT(CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)");
+				ASSERT(CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)");
+				ASSERT(CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)");
+				ASSERT(CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)");
+				ASSERT(CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)");
+				ASSERT(CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)");
 			#endif
 				/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 				if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); }
diff --git a/src/types.h b/src/types.h
index 47ebbba1..71ba40df 100755
--- a/src/types.h
+++ b/src/types.h
@@ -28,6 +28,9 @@
 
 /* Include any needed level-0 header files: */
 #include "platform.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include <inttypes.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,10 +41,10 @@ extern "C" {
 /*...useful utility parameters */
 
 #undef TRUE
-#define TRUE	1
+#define TRUE	true
 
 #undef FALSE
-#define FALSE	0
+#define FALSE	false
 
 /* Basic integer types - we assume char/short/int mean 8/16/32 bits, respectively,
 but this assumption gets checked at the start of program execution,
@@ -63,55 +66,22 @@ so we're not flying blind:
 #undef	sint64
 #undef	uint64
 
-#undef	 int64c
-#undef	sint64c
-#undef	uint64c
+typedef          int8_t		 int8;
+typedef          int8_t		sint8;
+typedef uint8_t			uint8;
 
-typedef          char		 int8;
-typedef          char		sint8;
-typedef unsigned char		uint8;
+typedef          int16_t	 int16;
+typedef          int16_t	sint16;
+typedef uint16_t		uint16;
 
-typedef          short		 int16;
-typedef          short		sint16;
-typedef unsigned short		uint16;
+typedef          int32_t	 int32;
+typedef          int32_t	sint32;
+typedef uint32_t		uint32;
 
-typedef          int		 int32;
-typedef          int		sint32;
-typedef unsigned int		uint32;
+typedef          int64_t	 int64;
+typedef          int64_t	sint64;
+typedef uint64_t		uint64;
 
-/* 64-bit int: */
-/* MSVC doesn't like 'long long', and of course MS has their own
-completely non-portable substitute:
-*/
-#if(defined(OS_TYPE_WINDOWS) && defined(COMPILER_TYPE_MSVC))
-	typedef   signed __int64	 int64;
-	typedef   signed __int64	sint64;
-	typedef unsigned __int64	uint64;
-	typedef const  signed __int64	 int64c;
-	typedef const  signed __int64	sint64c;
-	typedef const unsigned __int64	uint64c;
-
-	/* GW: In many cases where the C code is interfacing with the assembly code */
-	/* we must declare variables that are exactly 32-bits wide.  This is the */
-	/* portable way to do this, as the linux x86-64 C compiler defines the */
-	/* long data type as 64 bits.  We also use portable definitions for */
-	/* values that can be either an integer or a pointer. */
-	#if OS_BITS == 64
-		typedef  int64		intptr_t;
-		typedef uint64		uintptr_t;
-	#else
-		typedef  int32		intptr_t;
-		typedef uint32		uintptr_t;
-	#endif
-
-#else
-	typedef          long long	 int64;
-	typedef          long long	sint64;
-	typedef unsigned long long	uint64;
-	typedef const          long long	 int64c;
-	typedef const          long long	sint64c;
-	typedef const unsigned long long	uint64c;
-#endif
 /*
 #ifdef int32_t
 	#warning int32_t already defined!
diff --git a/src/util.c b/src/util.c
index 9e8fe146..dbf189b1 100644
--- a/src/util.c
+++ b/src/util.c
@@ -79,34 +79,19 @@ void WARN(long line, char*file, char*warn_string, char*warn_file, int copy2stder
 	__device__ void ASSERT(long line, char*file, int expr, char*assert_string) {}
 #else
 
-  #ifdef USE_C99
-
-	void ASSERT(char*func, long line, char*file, int expr, char*assert_string) {
-		/* Define a convenient spot to set a breakpoint: */
-		if(!expr) {
-			fprintf(stderr,"ERROR: Function %s, at line %lu of file %s\n", func, line, file);	fprintf(stderr,"Assertion failed: %s\n", assert_string);
-			/* Flush all output streams prior to asserting. We replace the original assert(0) call with
-			an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
-			fflush(NULL);
-			exit(EXIT_FAILURE);
-		}
-	}
-
-  #else
-
-	void ASSERT(long line, char*file, int expr, char*assert_string) {
+	// void ASSERT(char*func, long line, char*file, int expr, char*assert_string) {
+	void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string) {
 		/* Define a convenient spot to set a breakpoint: */
 		if(!expr) {
-			fprintf(stderr,"ERROR: at line %lu of file %s\n", line, file);	fprintf(stderr,"Assertion failed: %s\n", assert_string);
+			fprintf(stderr,"ERROR: Function %s, at line %ld of file %s\n", func, line, file);	fprintf(stderr,"Assertion '%s' failed: %s\n", assertion, assert_string);
 			/* Flush all output streams prior to asserting. We replace the original assert(0) call with
 			an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */
 			fflush(NULL);
-			exit(EXIT_FAILURE);	// Try to make this line coincide with a line # == 0 (mod 100) to ease breakpointing
+			// Terminate via abort() (raises SIGABRT) in place of the former exit(EXIT_FAILURE):
+			abort();
 		}
 	}
 
-  #endif
-
 #endif	// __CUDA_ARCH__ ?
 
 /***************/
@@ -136,7 +121,7 @@ void	VAR_WARN(char *typelist, ...)
 				dval = va_arg(varargs,double);
 				break;
 			default :
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 				break;
 		}
 	}
@@ -216,7 +201,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		y = (uint64 *)calloc(lenX + 1, sizeof(uint64));
 		// 10^100 has 333 bits, thus needs 6 uint64s, as do the mod-10^100 remainders,
 		// but we allow the convert_base10_char_mi64() utility to do the allocation of the former for us:
-		lenD = 0; ASSERT(HERE, 0x0 != (d = convert_base10_char_mi64("1000000000000000000000000000", &lenD)) && (lenD == 2), "0");
+		lenD = 0; ASSERT(0x0 != (d = convert_base10_char_mi64("1000000000000000000000000000", &lenD)) && (lenD == 2), "0");
 		r = (uint64 *)calloc(lenD, sizeof(uint64));
 		nc -= 28;		// starting char of first 27-digit chunk
 		for(i = 0; ; i+=2) {	// i = #divides counter; do 2 divs per loop exec in attempt to get some modest pipelining
@@ -240,7 +225,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		nc = nchars + (nchars/27) + 1;	// Add newlines to count
 		str[nc-1] = '\0';
 		fp = mlucas_fopen(fname, "w");
-		ASSERT(HERE, fp != 0x0, "Null file pointer!");
+		ASSERT(fp != 0x0, "Null file pointer!");
 		fprintf(fp,"%s\n", str);
 		fclose(fp);	fp = 0x0;
 		fprintf(stderr,"Done writing %s.",fname);
@@ -391,7 +376,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			for(i = 0; i < 8; ++i) {
 				// It's a PRP: check vs table of known pseudoprimes and (if it's not a PSP) init for the next PSP:
 				if((itmp32 >> i)&0x1) {
-					ASSERT(HERE, curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve");
+					ASSERT(curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve");
 					if((curr_p + pdsum_8[i]) == fbase2psp[fbase2psp_idx]) {	// It's a base-2 pseudoprime
 						++fbase2psp_idx;
 						continue;
@@ -549,7 +534,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			for(j = 0; j < 8; ++j) {
 				if((itmp32 >> j)&0x1)	// It's a PRP, so check against the table of known pseudoprimes and
 				{						// (if it's not a PSP) init for the next gap
-					ASSERT(HERE, curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve");
+					ASSERT(curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve");
 					if((curr_p + pdsum_8[j]) == fbase2psp[fbase2psp_idx]) {	/* It's a base-2 pseudoprime */
 						++fbase2psp_idx;
 						pdiff[i] += pdiff_8[j];
@@ -574,8 +559,8 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		printf("Using first %u odd primes; max gap = %u\n",nprime,2*max_diff);
 		printf("max sieving prime = %u\n",ihi);
 
-		ASSERT(HERE, p > thresh, "Mersenne prime exponent must be larger that allowable threshold!");
-		ASSERT(HERE, twopmodq32(p-1, p) == 1, "p fails base-2 fprp test!");
+		ASSERT(p > thresh, "Mersenne prime exponent must be larger that allowable threshold!");
+		ASSERT(twopmodq32(p-1, p) == 1, "p fails base-2 fprp test!");
 		np = 0;	// #primes in the current p-centered cohort
 		// find N primes < and > p, compute smoothness norm based on p-1 factorization for each, store each [p,snorm] pair
 		fbase2psp_idx = 0;	// Index to next-expected Fermat base-2 pseudoprime in the precomputed table
@@ -775,7 +760,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			//	printf("I = %d: x = %f; y = %f; hi,lo = %f,%f\n",i, h_A[i],h_B[i],h_D[i],h_C[i]);
 				if(cmp_fma_lohi_vs_exact(h_A[i],h_B[i],h_D[i],h_C[i], iax,iay,iahi,ialo)) {
 					printf("ERROR: pow2 = %d, I = %d, outputs differ!\n",pow2,i);
-					ASSERT(HERE, 0, "fma_dmult tests failed!");
+					ASSERT(0, "fma_dmult tests failed!");
 				}
 			}	// i-loop
 			pow2_dmult *= 2;
@@ -818,9 +803,9 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			} else {	// Fill in any remaining slots with 63-bit test data. of which we know we have > (1<<10):
 				p = fac63[i-nelt64].p;
 				q = fac63[i-nelt64].q;
-	//if((i-nelt64) < 10)printf("p[%3d] = %u: q = %llu ... ",i, p, q);
+	//if((i-nelt64) < 10)printf("p[%3d] = %u: q = %" PRIu64 " ... ",i, p, q);
 			}
-			ASSERT(HERE, p != 0, "p must be nonzero!");
+			ASSERT(p != 0, "p must be nonzero!");
 			// Compute auxiliary TF data:
 			pshift = p + 64;
 			jshift = leadz64(pshift);
@@ -837,10 +822,10 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			dbl /= (2.0*p);
 			rnd = DNINT(dbl);
 			k = (uint64)rnd;
-			ASSERT(HERE, k*(p<<1)+1 == q, "k computed incorrectly!");
+			ASSERT(k*(p<<1)+1 == q, "k computed incorrectly!");
 			*(h_p     + i) = p          ;	*(h_pshft + i) = pshift     ;	*(h_k + i) = k;
 			*(h_zshft + i) = zshift     ;	*(h_stidx + i) = start_index;
-		//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",i, p, pshift, zshift, start_index, k);
+		//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %" PRIu64 "\n",i, p, pshift, zshift, start_index, k);
 		}
 		printf("Testing %d = %d 64-bit and %d 63-bit known-factors...",N,nelt64,N-nelt64);
 
@@ -849,7 +834,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		for(i = 0; i < N; ++i) {
 			*(h_B+i) = 0;
 		}
-	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
+	//	printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
 		// Allocate vectors in device memory
 		uint64 *d_p,*d_pshft,*d_k;
 		uint32 *d_zshft,*d_stidx;
@@ -886,8 +871,8 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			j = (uint32)twopmodq64((uint64)p, q);
 			if((j != 1) || (*(h_B + i) != 1)) {
 				printf("cudaVecModpowTest64: Mismatch between Ref and GPU result:\n");
-				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,q) = %u, %llu\n", i,*(h_B + i), j,p,q);
-				ASSERT(HERE, 0, "cudaVecModpowTest64 failed!");
+				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,q) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,q);
+				ASSERT(0, "cudaVecModpowTest64 failed!");
 			}
 		}
 		printf("cudaVecModpowTest64 with %d test (p,q) pairs succeeded!\n",N);
@@ -933,7 +918,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		for(i = 0; i < N; ++i) {
 			*(h_B+i) = 0;
 		}
-	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
+	//	printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
 		// Allocate vectors in device memory
 		uint64 *d_p,*d_pshft,*d_k;
 		uint32 *d_zshft,*d_stidx;
@@ -967,13 +952,13 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 
 		// Reference computation:
 		j = (uint32)twopmodq78_3WORD_DOUBLE((uint64)p, k);
-		ASSERT(HERE, (j == 1), "cudaVecModpowTest78_0 ref-comp failed!");
+		ASSERT((j == 1), "cudaVecModpowTest78_0 ref-comp failed!");
 		// Test GPU results:
 		for(i = 0; i < N; ++i) {
 			if(*(h_B + i) != 1) {
 				printf("cudaVecModpowTest78_0: Mismatch between Ref and GPU result:\n");
-				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %llu\n", i,*(h_B + i), j,p,k);
-				ASSERT(HERE, *(h_B + i) == 1, "cudaVecModpowTest78_0 failed!");
+				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,k);
+				ASSERT(*(h_B + i) == 1, "cudaVecModpowTest78_0 failed!");
 			}
 		}
 		printf("cudaVecModpowTest78_0 with %d test (p,q) pairs succeeded!\n",N);
@@ -1024,7 +1009,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			k = (uint64)rnd;
 			*(h_p     + nelts) = p          ;	*(h_pshft + nelts) = pshift     ;	*(h_k + nelts) = k;
 			*(h_zshft + nelts) = zshift     ;	*(h_stidx + nelts) = start_index;
-	//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",nelts, p, pshift, zshift, start_index, k);
+	//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %" PRIu64 "\n",nelts, p, pshift, zshift, start_index, k);
 			++nelts;
 		}
 		printf("Testing %d 78-bit known-factors...",nelts);
@@ -1056,7 +1041,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		for(i = 0; i < N; ++i) {
 			*(h_B+i) = 0;
 		}
-	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
+	//	printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
 		// Allocate vectors in device memory
 		uint64 *d_p,*d_pshft,*d_k;
 		uint32 *d_zshft,*d_stidx;
@@ -1094,8 +1079,8 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			j = (uint32)twopmodq78_3WORD_DOUBLE((uint64)p, k);
 			if((j != 1) || (*(h_B + i) != 1)) {
 				printf("cudaVecModpowTest78: Mismatch between Ref and GPU result:\n");
-				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %llu\n", i,*(h_B + i), j,p,k);
-				ASSERT(HERE, 0, "cudaVecModpowTest78 failed!");
+				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,k);
+				ASSERT(0, "cudaVecModpowTest78 failed!");
 			}
 		}
 		printf("cudaVecModpowTest78 with %d test (p,q) pairs succeeded!\n",nelts);
@@ -1160,12 +1145,12 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			k = x96.d0;
 			// Skip any (p,q) pair for which the k > 2^64:
 			if(x96.d1 != 0) {	// x128 holds k
-			//	printf("Warning: k > 2^64 detected for (p,q) = %u,[%u*2^64 + %llu] ... skipping this datum.\n",p,q96.d1,q96.d0);
+			//	printf("Warning: k > 2^64 detected for (p,q) = %u,[%u*2^64 + %" PRIu64 "] ... skipping this datum.\n",p,q96.d1,q96.d0);
 				continue;
 			}
 			*(h_p     + nelts) = p          ;	*(h_pshft + nelts) = pshift     ;	*(h_k + nelts) = k;
 			*(h_zshft + nelts) = zshift     ;	*(h_stidx + nelts) = start_index;
-	//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",nelts, p, pshift, zshift, start_index, k);
+	//	printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %" PRIu64 "\n",nelts, p, pshift, zshift, start_index, k);
 			++nelts;
 		}
 		printf("Testing %d 96-bit known-factors...",nelts);
@@ -1197,7 +1182,7 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 		for(i = 0; i < N; ++i) {
 			*(h_B+i) = 0;
 		}
-	//	printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
+	//	printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index);
 		// Allocate vectors in device memory
 		uint64 *d_p,*d_pshft,*d_k;
 		uint32 *d_zshft,*d_stidx;
@@ -1236,8 +1221,8 @@ void	ui64_bitstr(const uint64 ui64, char*ostr)
 			j = (q96.d1 == 0) && (q96.d0 == 1);
 			if((j != 1) || (*(h_B + i) != 1)) {
 				printf("cudaVecModpowTest96: Mismatch between Ref and GPU result:\n");
-				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %llu\n", i,*(h_B + i), j,p,k);
-				ASSERT(HERE, 0, "cudaVecModpowTest96 failed!");
+				printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,k);
+				ASSERT(0, "cudaVecModpowTest96 failed!");
 			}
 		}
 		printf("cudaVecModpowTest96 with %d test (p,q) pairs succeeded!\n",nelts);
@@ -1452,7 +1437,7 @@ void host_init(void)
 	TWO25FLOAT = (double)0x02000000;				TWO25FLINV = 1.0/TWO25FLOAT;
 	TWO26FLOAT = (double)0x04000000;				TWO26FLINV = 1.0/TWO26FLOAT;
 	dbl = qfdbl(qfmul_pow2(QONE, -26));
-	ASSERT(HERE, TWO26FLINV == dbl, "TWO26FLINV!");
+	ASSERT(TWO26FLINV == dbl, "TWO26FLINV!");
 
 	TWO13FLINV = qfdbl(qfmul_pow2(QONE, -13));
 
@@ -1477,8 +1462,8 @@ void host_init(void)
 	qtest();	// 09/23/2012: Move to after above float-consts-inits because of the qfloat/mi64 routines which use those consts.
 
 	/* Use qfloat routines to set the global floating-point constant 1/sqrt(2): */
-	ASSERT(HERE, ISRT2 == qfdbl(QISRT2), "1/sqrt2 precision check failed!");
-	ASSERT(HERE, SQRT2 == qfdbl(QSQRT2), "  sqrt2 precision check failed!");
+	ASSERT(ISRT2 == qfdbl(QISRT2), "1/sqrt2 precision check failed!");
+	ASSERT(SQRT2 == qfdbl(QSQRT2), "  sqrt2 precision check failed!");
 
 #ifdef CPU_IS_X86	// May 2018: It seems I only found need to call this runtime CPU-mode setting in 32-bit x86 mode, not 64-bit. But had occasion
 					// to fiddle w/rnd-mode in some x86_64 tests, so changed things so that the function is *defined* in both 32 and 64-bit modes.
@@ -1490,13 +1475,13 @@ void host_init(void)
 
 	// Test wide-mul routines:
 	printf("INFO: testing IMUL routines...\n");
-	ASSERT(HERE, test_mul() == 0, "test_mul() returns nonzero!");
+	ASSERT(test_mul() == 0, "test_mul() returns nonzero!");
 
 	// Test the 64-bit 2^[+|-]p (mod q) functions:
 	uint32 imax = 100000;
 	fprintf(stderr,"INFO: Testing 64-bit 2^p (mod q) functions with %u random (p, q odd) pairs...\n",imax);
 	clock1 = clock();
-	ASSERT(HERE, test_twopmodq64(imax) == 0, "test_twopmodq64() returns nonzero!");
+	ASSERT(test_twopmodq64(imax) == 0, "test_twopmodq64() returns nonzero!");
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 //	printf("Time for %u 2^[+|-]p (mod q) call pairs =%s\n",imax, get_time_str(tdiff));
@@ -1521,10 +1506,10 @@ void host_init(void)
 	uint64 vec[max_test_dim], exp;	// Use a known M-prime exponent and dimension vec suitably
 	const uint32 mers_expos[] = {61,89,107,127,521,607,1279,2203,2281,3217,4253,4423,9689,9941,11213,19937,21701,0x0};
 	for(i = 0, exp = (uint64)mers_expos[i]; exp != 0; i++) {
-		fprintf(stderr,"TEST_MI64_PRP: Base-3 Fermat-PRP test of M(%llu)...\n",exp);
-		ASSERT(HERE, exp < (max_test_dim<<6), "Bignum-PRP test exponent larger than test-vec dimension permits!");
+		fprintf(stderr,"TEST_MI64_PRP: Base-3 Fermat-PRP test of M(%" PRIu64 ")...\n",exp);
+		ASSERT(exp < (max_test_dim<<6), "Bignum-PRP test exponent larger than test-vec dimension permits!");
 		j = mi64_init_mers_or_ferm_modulus(exp, 0, vec);
-		ASSERT(HERE, mi64_pprimeF(vec,3ull,j), "TEST_MI64_PRP: Base-3 Fermat-PRP test fails!");
+		ASSERT(mi64_pprimeF(vec,3ull,j), "TEST_MI64_PRP: Base-3 Fermat-PRP test fails!");
 	}
 	exit(0);
 #endif
@@ -1534,21 +1519,21 @@ void host_init(void)
 	printf("INFO: Timing-testing selected FFT macros...\n");
 
   #if defined(USE_SSE2) && !defined(USE_AVX)	// 4-DFT is SSE2-only
-//	ASSERT(HERE, test_radix4_dft() == 0, "test_radix4_dft() returns nonzero!");
+//	ASSERT(test_radix4_dft() == 0, "test_radix4_dft() returns nonzero!");
   #endif
 
-//	ASSERT(HERE, test_radix16_dft() == 0, "test_radix16_dft() returns nonzero!");
+//	ASSERT(test_radix16_dft() == 0, "test_radix16_dft() returns nonzero!");
 
 	#include "radix32_dif_dit_pass_asm.h"	// Commenting this out gives compile error
-//	ASSERT(HERE, test_radix32_dft() == 0, "test_radix32_dft() returns nonzero!");
+//	ASSERT(test_radix32_dft() == 0, "test_radix32_dft() returns nonzero!");
 
   #ifdef USE_AVX
 	test_vperm2f128();	// Is one designed for step-thru debug
 exit(0);
-//	ASSERT(HERE, test_simd_transpose_4x4() == 0, "test_simd_transpose_4x4() returns nonzero!");
+//	ASSERT(test_simd_transpose_4x4() == 0, "test_simd_transpose_4x4() returns nonzero!");
   #endif
   #ifdef USE_AVX512
-	ASSERT(HERE, test_simd_transpose_8x8() == 0, "test_simd_transpose_8x8() returns nonzero!");
+	ASSERT(test_simd_transpose_8x8() == 0, "test_simd_transpose_8x8() returns nonzero!");
 exit(0);
   #endif
 #endif
@@ -1557,23 +1542,23 @@ exit(0);
 #if INCLUDE_GMP && 0
 	uint32 m = 33;	// 7 Sep 2021: GMP gcd on Haswell quad needs 24|54 min for F31|32-sized inputs; insufficient RAM (8 GB) for F33
 					// On KNL with 16GB MCDRAM, need ??|??|?? min for F31|32|33, with F30 running on cores 0-63 and GIMPS-DC on 64-67.
-	ASSERT(HERE, m < 64, "Fermat-number index must be < 64!");
+	ASSERT(m < 64, "Fermat-number index must be < 64!");
 	printf("INFO: testing GCD routines on F%u-sized inputs\n",m);
 	// Apr 2021: check known factor of F31 using both mi64_div and GMP gcd, to get timing on the latter:
 	rng_isaac_init(TRUE);
 	uint64 rem[2] = {0ull,0ull}, q[2] = {3118754346955702273ull,2544ull};	// Known factor of F31: k = 3.13.140091319777; q = k.2^(m+2) + 1
 	int i,isfact,nlimb = (1<<(m-6)) + 1;	// # of 64-bit limbs in Fm, which has 2^m+1 bits, thus needs one extra limb for the high 1-bit
 	// vec0 is used for scratch storage, since mi64_mul_vector() does not permit in-place operation:
-	uint64*vec0 = calloc(nlimb,sizeof(uint64));	ASSERT(HERE, vec0 != NULL, "vec0[]-array alloc failed!");
-	uint64*vec1 = calloc(nlimb,sizeof(uint64));	ASSERT(HERE, vec1 != NULL, "vec1[]-array alloc failed!");
-	uint64*vec2 = calloc(nlimb,sizeof(uint64));	ASSERT(HERE, vec2 != NULL, "vec2[]-array alloc failed!");
+	uint64*vec0 = calloc(nlimb,sizeof(uint64));	ASSERT(vec0 != NULL, "vec0[]-array alloc failed!");
+	uint64*vec1 = calloc(nlimb,sizeof(uint64));	ASSERT(vec1 != NULL, "vec1[]-array alloc failed!");
+	uint64*vec2 = calloc(nlimb,sizeof(uint64));	ASSERT(vec2 != NULL, "vec2[]-array alloc failed!");
 	// Init 2 random (mlimb-1)-length multiples of q:
 	for(i = 0; i < nlimb-2; i++) { vec0[i] = rng_isaac_rand(); vec1[i] = rng_isaac_rand(); }
 	// i holds product length on return:
-	mi64_mul_vector(vec1,nlimb-2, q,2, vec2,&i);	ASSERT(HERE, i == nlimb, "Bad product length in gcd-test init!");
-	mi64_mul_vector(vec0,nlimb-2, q,2, vec1,&i);	ASSERT(HERE, i == nlimb, "Bad product length in gcd-test init!");
-	isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem);	ASSERT(HERE, isfact != 0, "mi64_div failed to find target factor!");
-	isfact = mi64_div(vec2,q, nlimb,2, 0x0, rem);	ASSERT(HERE, isfact != 0, "mi64_div failed to find target factor!");
+	mi64_mul_vector(vec1,nlimb-2, q,2, vec2,&i);	ASSERT(i == nlimb, "Bad product length in gcd-test init!");
+	mi64_mul_vector(vec0,nlimb-2, q,2, vec1,&i);	ASSERT(i == nlimb, "Bad product length in gcd-test init!");
+	isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem);	ASSERT(isfact != 0, "mi64_div failed to find target factor!");
+	isfact = mi64_div(vec2,q, nlimb,2, 0x0, rem);	ASSERT(isfact != 0, "mi64_div failed to find target factor!");
 	// Now feed our two random-multiple vectors to GMP gcd:
 	char gcd_str[STR_MAX_LEN];
 	isfact = gcd(0,0ull,vec1,vec2,nlimb,gcd_str);	// 1st arg = stage just completed
@@ -1650,7 +1635,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u rng64 calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i64 != 0ull,"rng64 sum = 0!");
+	ASSERT(i64 != 0ull,"rng64 sum = 0!");
 
 	clock1 = clock();
 	i32 = 0;
@@ -1665,7 +1650,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u [rng64 + 4*popcount32()] calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i32,"popcount32 sum = 0!");
+	ASSERT(i32,"popcount32 sum = 0!");
 
 	clock1 = clock();
 	i32 = 0;
@@ -1679,7 +1664,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u [rng64 + 4*popcount64()] calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i32,"popcount64 sum = 0!");
+	ASSERT(i32,"popcount64 sum = 0!");
 
 	clock1 = clock();
 	i32 = 0;
@@ -1694,7 +1679,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u [rng64 + 4*leadz32()] calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i32,"leadz32 sum = 0!");
+	ASSERT(i32,"leadz32 sum = 0!");
 
 	clock1 = clock();
 	i32 = 0;
@@ -1708,7 +1693,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u [rng64 + 4*leadz64()] calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i32,"leadz64 sum = 0!");
+	ASSERT(i32,"leadz64 sum = 0!");
 
 	clock1 = clock();
 	i32 = 0;
@@ -1724,7 +1709,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u [rng64 + 4*trailz32()] calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i32,"trailz32 sum = 0!");
+	ASSERT(i32,"trailz32 sum = 0!");
 
 	clock1 = clock();
 	i32 = 0;
@@ -1738,7 +1723,7 @@ exit(0);
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
 	printf("Time for %u [rng64 + 4*trailz64()] calls =%s\n",imax, get_time_str(tdiff));
-	ASSERT(HERE, i32,"trailz64 sum = 0!");
+	ASSERT(i32,"trailz64 sum = 0!");
 exit(0);
 	clock1 = clock();
 	for(i = 0; i < imax; i++) {
@@ -1747,11 +1732,11 @@ exit(0);
 		x32 = (uint32)i64;
 		int ii = ith_set_bit32(x32,bit);
 		if(popcount32(x32) < bit)
-			ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!");
+			ASSERT(ii == -1, "[bit]th-bit specifier out of range!");
 		else {
 			uint32 tmp32 = x32 << (31-ii);
-			ASSERT(HERE, tmp32 & 0x80000000,"ith_set_bit64 retval not actually set!");
-			ASSERT(HERE, popcount32(tmp32) == bit, "ith_set_bit32 checksum fail!");
+			ASSERT(tmp32 & 0x80000000,"ith_set_bit32 retval not actually set!");	// 32-bit variant is the one under test in this loop
+			ASSERT(popcount32(tmp32) == bit, "ith_set_bit32 checksum fail!");
 		}
 	}
 	clock2 = clock();
@@ -1764,12 +1749,12 @@ exit(0);
 		bit = (i64>>32) & 0x3f;	if(!bit) continue;
 		int ii = ith_set_bit64(i64,bit);
 		if(popcount64(i64) < bit)
-			ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!");
+			ASSERT(ii == -1, "[bit]th-bit specifier out of range!");
 		else {
 			uint64 tmp64 = i64 << (63-ii);
 			// Must cast result of AND to 32-bit here (via compare-vs-0) since ASSERT (expr) is 32-bit:
-			ASSERT(HERE, (tmp64 & 0x8000000000000000ull) != 0,"ith_set_bit64 retval not actually set!");
-			ASSERT(HERE, popcount64(tmp64) == bit, "ith_set_bit64 checksum fail!");
+			ASSERT((tmp64 & 0x8000000000000000ull) != 0,"ith_set_bit64 retval not actually set!");
+			ASSERT(popcount64(tmp64) == bit, "ith_set_bit64 checksum fail!");
 		}
 	}
 	clock2 = clock();
@@ -1786,12 +1771,12 @@ exit(0);
 		bit = (iarr[0]>>32) & 0xff;	if(!bit) continue;
 		int ii = mi64_ith_set_bit(iarr,bit,4);
 		if(mi64_popcount(iarr,4) < bit)
-			ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!");
+			ASSERT(ii == -1, "[bit]th-bit specifier out of range!");
 		else {
 			mi64_shl(iarr,iarr,(255-ii),4);
 			// Must cast result of AND to 32-bit here (via compare-vs-0) since ASSERT (expr) is 32-bit:
-			ASSERT(HERE, (iarr[3] & 0x8000000000000000ull) != 0,"mi64_ith_set_bit64 retval not actually set!");
-			ASSERT(HERE, mi64_popcount(iarr,4) == bit, "mi64_ith_set_bit64 checksum fail!");
+			ASSERT((iarr[3] & 0x8000000000000000ull) != 0,"mi64_ith_set_bit64 retval not actually set!");
+			ASSERT(mi64_popcount(iarr,4) == bit, "mi64_ith_set_bit64 checksum fail!");
 		}
 	}
 	clock2 = clock();
@@ -1804,10 +1789,10 @@ exit(0);
 	int i;
 	const int n = 1000, iters = 1000000;
 	// Allocate the main data arrays, require these to be on 16-byte boundaries to enable SSE2-based addsub:
-	uint64 *u = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)u & 0xf) == 0, "u not 16-byte aligned!");
-	uint64 *v = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)v & 0xf) == 0, "u not 16-byte aligned!");
-	uint64 *x = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)x & 0xf) == 0, "u not 16-byte aligned!");
-	uint64 *y = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(HERE, ((uint32)y & 0xf) == 0, "u not 16-byte aligned!");
+	uint64 *u = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(((uintptr_t)u & 0xf) == 0, "u not 16-byte aligned!");	// uintptr_t: pointer-width cast, no truncation on 64-bit
+	uint64 *v = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(((uintptr_t)v & 0xf) == 0, "v not 16-byte aligned!");
+	uint64 *x = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(((uintptr_t)x & 0xf) == 0, "x not 16-byte aligned!");
+	uint64 *y = (uint64 *)calloc(n, sizeof(uint64));	ASSERT(((uintptr_t)y & 0xf) == 0, "y not 16-byte aligned!");
 
 	/* Init the RNG and the inputs: */
 	rng_isaac_init(TRUE);
@@ -1821,14 +1806,14 @@ exit(0);
 	uint64 cy1 = mi64_add(u,v,x,n);
 	uint64 cy2 = mi64_add_ref(u,v,y,n);
 	if(cy1 != cy2) {
-		printf("Carryout mismatch: cy1 = %llu, cy2 = %llu\n",cy1,cy2);
-	//	ASSERT(HERE, 0, "Incorrect mi64_add carryout");	// GCC 4.4.5 builds on my SB give carry-mismatch here ... wtf?
+		printf("Carryout mismatch: cy1 = %" PRIu64 ", cy2 = %" PRIu64 "\n",cy1,cy2);
+	//	ASSERT(0, "Incorrect mi64_add carryout");	// GCC 4.4.5 builds on my SB give carry-mismatch here ... wtf?
 	}
 	for(i = 0; i < n; i++)
 	{
 		if(x[i] != y[i]) {
-			printf("Output mismatch: x[%d] = %llu, y[%d] = %llu\n",i,x[i],i,y[i]);
-			ASSERT(HERE, 0, "Incorrect mi64_add output element");
+			printf("Output mismatch: x[%d] = %" PRIu64 ", y[%d] = %" PRIu64 "\n",i,x[i],i,y[i]);
+			ASSERT(0, "Incorrect mi64_add output element");
 		}
 	}
 
@@ -1840,7 +1825,7 @@ exit(0);
 	}
 	clock2 = clock();
 	tdiff = (double)(clock2 - clock1);
-	printf	("mi64_add: Time for %llu limbs =%s\n",(uint64)iters*n, get_time_str(tdiff));
+	printf	("mi64_add: Time for %" PRIu64 " limbs =%s\n",(uint64)iters*n, get_time_str(tdiff));
 	exit(0);
 #endif
 	/************************************************************/
@@ -1871,7 +1856,7 @@ exit(0);
 	#error Unrecognized multithreading model!
   #endif
 	// MAX_THREADS based on number of processing cores will most often be a power of 2, but don't assume that.
-	ASSERT(HERE, MAX_THREADS > 0,"Mlucas.c: MAX_THREADS must be > 0");
+	ASSERT(MAX_THREADS > 0,"Mlucas.c: MAX_THREADS must be > 0");
 
 	printf("INFO: System has %d available processor cores.\n", MAX_THREADS);
 
@@ -1880,7 +1865,7 @@ exit(0);
 	ncpu = MAX_THREADS;
 	printf("INFO: Testing Multithreading support with %d threads...\n", ncpu);
 	// Toggle boolean 2nd arg here to enable verbose mode:
-	ASSERT(HERE, test_pthreads(nthr,FALSE) == 0, "test_pthreads() returns nonzero!");
+	ASSERT(test_pthreads(nthr,FALSE) == 0, "test_pthreads() returns nonzero!");
   #endif
 #endif
 
@@ -1921,7 +1906,7 @@ void set_stacklimit_restart(char *argv[])
 
 	if (getrlimit(RLIMIT_STACK, &stack_limits)) {
 		fprintf(stderr, "Call to getrlimit() failed.\n");
-		ASSERT(HERE, 0, "Exiting.");
+		ASSERT(0, "Exiting.");
 	}
 	printf("Old stack_limits: cur = %zu, max = %zu, [RLIM_INFINITY = %zu]\n",
 	       stack_limits.rlim_cur, stack_limits.rlim_max, RLIM_INFINITY);
@@ -1932,14 +1917,14 @@ void set_stacklimit_restart(char *argv[])
 
 	if (setrlimit(RLIMIT_STACK, &stack_limits)) {
 		fprintf(stderr, "Call to setrlimit() failed.\n");
-		ASSERT(HERE, 0, "Exiting.");
+		ASSERT(0, "Exiting.");
 	}
 	printf("New stack_limits: cur = %zu, max = %zu\n",
 	       stack_limits.rlim_cur, stack_limits.rlim_max);
 
 	if(execvp(argv[0], argv)) {
 		fprintf(stderr, "Call to execvp() failed.\n");
-		ASSERT(HERE, 0, "Exiting.");
+		ASSERT(0, "Exiting.");
 	}
 #endif /* CPU_IS_X86  */
 }
@@ -1979,7 +1964,7 @@ uint32 get_system_ram(void) {
 	MEMORYSTATUSEX memInfo;
 	memInfo.dwLength = sizeof(memInfo);
 	GlobalMemoryStatusEx(&memInfo);
-	fprintf(stderr, "System total RAM = %llu, free RAM = %llu\n", memInfo.ullTotalPhys>>20, memInfo.ullAvailPhys>>20);
+	fprintf(stderr, "System total RAM = %" PRIu64 ", free RAM = %" PRIu64 "\n", memInfo.ullTotalPhys>>20, memInfo.ullAvailPhys>>20);
 	return memInfo.ullAvailPhys>>20;
 
 #elif defined(OS_TYPE_MACOSX)
@@ -2017,7 +2002,7 @@ uint32 get_system_ram(void) {
 	{
 		char in_line[STR_MAX_LEN];
 		FILE*fp = mlucas_fopen("/proc/cpuinfo", "r");
-		ASSERT(HERE, fp != 0x0, "/proc/cpuinfo file not found!");
+		ASSERT(fp != 0x0, "/proc/cpuinfo file not found!");
 		while(fgets(in_line, STR_MAX_LEN, fp) != 0x0) {
 			if(strstr(in_line, "asimd") != 0)
 				return 1;
@@ -2106,7 +2091,7 @@ void print_host_info(void)
 	if(cudaError != cudaSuccess)
 	{
 		printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError));
-		ASSERT(HERE, 0, "gpu_sieve: GPU-side error detected!");
+		ASSERT(0, "gpu_sieve: GPU-side error detected!");
 	}
 
 //	cudaVecAddTest();
@@ -2131,7 +2116,7 @@ void print_host_info(void)
 	char hwloc_version[12];
 	snprintf(hwloc_version,sizeof(hwloc_version),"%u.%u.%u",HWLOC_API_VERSION>>16,(HWLOC_API_VERSION>>8)&0xff,HWLOC_API_VERSION&0xff);
 	printf("HWLOC Version = %s; \n",hwloc_version);
-	ASSERT(HERE,hw_topology != 0x0,"HWLOC hardware topology object not initialized!");
+	ASSERT(hw_topology != 0x0,"HWLOC hardware topology object not initialized!");
 	int topodepth = hwloc_topology_get_depth(hw_topology);
 	uint32 nsock = hwloc_get_nbobjs_by_type(hw_topology, HWLOC_OBJ_PACKAGE);
 	uint32 ncore = hwloc_get_nbobjs_by_type(hw_topology, HWLOC_OBJ_CORE);
@@ -2149,7 +2134,7 @@ void print_host_info(void)
 	#endif
 	} else {
 	#ifdef USE_ARM_V8_SIMD
-		ASSERT(HERE, 0, "#define USE_ARM_V8_SIMD invoked but no advanced-SIMD support detected on this CPU!\n");
+		ASSERT(0, "#define USE_ARM_V8_SIMD invoked but no advanced-SIMD support detected on this CPU!\n");
 	#endif
 	}
 
@@ -2167,7 +2152,7 @@ void print_host_info(void)
   #ifdef USE_IMCI512	// 1st-gen Xeon Phi (KNF,KNC)
 
 	if(has_avx512()) {
-		ASSERT(HERE, 0, "Build uses AVX-512 instruction set, but only k1om / IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n");
+		ASSERT(0, "Build uses AVX-512 instruction set, but only k1om / IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n");
 	} else if(has_imci512()) {
 		printf("INFO: Build uses k1om / IMCI-512 instruction set.\n");
 	} else {
@@ -2178,13 +2163,13 @@ void print_host_info(void)
 		CPUID(1,0,a,b,c,d);
 		printf("has_imci512: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d);
 		printf("#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
-		ASSERT(HERE, 0, "#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
+		ASSERT(0, "#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
 	}
 
   #elif(defined(USE_AVX512))
 
 	if(has_imci512()) {
-		ASSERT(HERE, 0, "Build uses AVX512 instruction set, but only IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n");
+		ASSERT(0, "Build uses AVX512 instruction set, but only IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n");
 	} else 	if(has_avx512()) {
 		printf("INFO: Build uses AVX512 instruction set.\n");
 	} else {
@@ -2195,7 +2180,7 @@ void print_host_info(void)
 		CPUID(1,0,a,b,c,d);
 		printf("has_avx512: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d);
 		printf("#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
-		ASSERT(HERE, 0, "#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
+		ASSERT(0, "#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
 	}
 
   #elif(defined(USE_AVX2))
@@ -2212,7 +2197,7 @@ void print_host_info(void)
 		CPUID(1,0,a,b,c,d);
 		printf("has_avx2: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d);
 		printf("#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
-		ASSERT(HERE, 0, "#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
+		ASSERT(0, "#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
 	}
 
   #elif(defined(USE_AVX))
@@ -2222,7 +2207,7 @@ void print_host_info(void)
 	} else if(has_avx()) {
 		printf("INFO: Build uses AVX instruction set.\n");
 	} else {
-		ASSERT(HERE, 0, "#define USE_AVX invoked but no AVX support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
+		ASSERT(0, "#define USE_AVX invoked but no AVX support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
 	}
 
   #elif(defined(USE_SSE2))
@@ -2235,7 +2220,7 @@ void print_host_info(void)
 	if(has_sse2()) {
 		printf("INFO: Build uses SSE2 ... 'enhanced SSE2' supported by CPU: SSE[3,3e,4.1,4.2] = [%u,%u,%u,%u]\n",has_sse3(),has_sse3e(),has_sse41(),has_sse42());
 	} else {
-		ASSERT(HERE, 0, "#define USE_SSE2 invoked but no SSE2 support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
+		ASSERT(0, "#define USE_SSE2 invoked but no SSE2 support detected on this CPU! Check get_cpuid functionality and CPU type.\n");
 	}
 
   #else
@@ -2272,7 +2257,7 @@ void print_host_info(void)
 			printf("INFO: mkdir -p \"%s\" succeeded\n", MLUCAS_PATH);
 		} else {
 			fprintf(stderr, "ERROR: mkdir -p \"%s\" failed\n", MLUCAS_PATH);
-			ASSERT(HERE, 0, "Exiting.");
+			ASSERT(0, "Exiting.");
 		}
 	}
 }
@@ -2423,7 +2408,7 @@ For the purpose of completeness, the other FPU control bits are as follows
 	#else
 		unsigned short FPUCTRL;
 	#endif
-		ASSERT(HERE, (FPU_MODE == FPU_64RND) || (FPU_MODE == FPU_64CHOP), "Illegal value of FPU_MODE");
+		ASSERT((FPU_MODE == FPU_64RND) || (FPU_MODE == FPU_64CHOP), "Illegal value of FPU_MODE");
 
 		// Check the SIMD control word:
 	#ifdef USE_SSE2
@@ -2543,7 +2528,7 @@ For the purpose of completeness, the other FPU control bits are as follows
 			printf("INFO: compiler sets x87 FPU to [round ==> 0] (truncate) rounding mode. Overriding...Setting to [round ==> nearest].\n");
 			break;
 		default:
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 	}
 
@@ -2560,24 +2545,24 @@ void check_nbits_in_types(void)
 	double ln2 = LOG2;
 
 	/* Make sure TRUE and FALSE behave as required: */
-	ASSERT(HERE, !FALSE && TRUE, "TRUE and FALSE do not behave as required in check_nbits_in_types");
+	ASSERT(!FALSE && TRUE, "TRUE and FALSE do not behave as required in check_nbits_in_types");
 
 	/* Check lengths of basic data types: */
-    ASSERT(HERE, sizeof( int8 ) == 1, "sizeof( int8 ) != 1");
-    ASSERT(HERE, sizeof(uint8 ) == 1, "sizeof(uint8 ) != 1");
-    ASSERT(HERE, sizeof( int16) == 2, "sizeof( int16) != 2");
-    ASSERT(HERE, sizeof(uint16) == 2, "sizeof(uint16) != 2");
-    ASSERT(HERE, sizeof( int32) == 4, "sizeof( int32) != 4");
-    ASSERT(HERE, sizeof(uint32) == 4, "sizeof(uint32) != 4");
-    ASSERT(HERE, sizeof( int64) == 8, "sizeof( int64) != 8");
-    ASSERT(HERE, sizeof(uint64) == 8, "sizeof(uint64) != 8");
-    ASSERT(HERE, sizeof(uint64) >= sizeof(void*), "sizeof(long long) != sizeof(void*)");    /* ALIGN_DOUBLES assumes this. */
+    ASSERT(sizeof( int8 ) == 1, "sizeof( int8 ) != 1");
+    ASSERT(sizeof(uint8 ) == 1, "sizeof(uint8 ) != 1");
+    ASSERT(sizeof( int16) == 2, "sizeof( int16) != 2");
+    ASSERT(sizeof(uint16) == 2, "sizeof(uint16) != 2");
+    ASSERT(sizeof( int32) == 4, "sizeof( int32) != 4");
+    ASSERT(sizeof(uint32) == 4, "sizeof(uint32) != 4");
+    ASSERT(sizeof( int64) == 8, "sizeof( int64) != 8");
+    ASSERT(sizeof(uint64) == 8, "sizeof(uint64) != 8");
+    ASSERT(sizeof(uint64) >= sizeof(void*), "sizeof(uint64) < sizeof(void*)");    /* ALIGN_DOUBLES assumes a pointer fits in a uint64. */
 
 	/* AltiVec vector types: */
 #if(CPU_HAS_ALTIVEC || CPU_IS_CELL)
-	ASSERT(HERE, sizeof(vec_uint8X16) == 16 , "sizeof(vec_uint8X16) != 16 ");
-	ASSERT(HERE, sizeof(vec_uint16X8) == 16 , "sizeof(vec_uint16x8) != 16 ");
-	ASSERT(HERE, sizeof(vec_uint32X4) == 16 , "sizeof(vec_uint32x4) != 16 ");
+	ASSERT(sizeof(vec_uint8X16) == 16 , "sizeof(vec_uint8X16) != 16 ");
+	ASSERT(sizeof(vec_uint16X8) == 16 , "sizeof(vec_uint16x8) != 16 ");
+	ASSERT(sizeof(vec_uint32X4) == 16 , "sizeof(vec_uint32x4) != 16 ");
 #endif
 
 	uint64 x = 0x0706050403020100ull;
@@ -2585,14 +2570,14 @@ void check_nbits_in_types(void)
 	// Runtime ordering is little-endian:
 	if(byte_arr[0] == 0 && byte_arr[1] == 1 && byte_arr[2] == 2 && byte_arr[3] == 3 && byte_arr[4] == 4 && byte_arr[5] == 5 && byte_arr[6] == 6 && byte_arr[7] == 7) {
 	  #ifdef USE_BIG_ENDIAN
-		ASSERT(HERE, 0, "USE_BIG_ENDIAN set in platform.h but little-endian detected at runtime!");
+		ASSERT(0, "USE_BIG_ENDIAN set in platform.h but little-endian detected at runtime!");
 	  #endif
 	} else if(byte_arr[0] == 7 && byte_arr[1] == 6 && byte_arr[2] == 5 && byte_arr[3] == 4 && byte_arr[4] == 3 && byte_arr[5] == 2 && byte_arr[6] == 1 && byte_arr[7] == 0) {
 	  #ifndef USE_BIG_ENDIAN
-		ASSERT(HERE, 0, "USE_BIG_ENDIAN not set in platform.h but big-endian detected at runtime!");
+		ASSERT(0, "USE_BIG_ENDIAN not set in platform.h but big-endian detected at runtime!");
 	  #endif
 	} else {
-		ASSERT(HERE, 0, "Endianness detected as neither big nor little-endian at runtime!");
+		ASSERT(0, "Endianness detected as neither big nor little-endian at runtime!");
 	}
 
 	// Init RNG:
@@ -2659,10 +2644,10 @@ void check_nbits_in_types(void)
 #else
 
 	sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, pi  = %20.3f,  DNINT(pi ) = %20.3f\n", RND_A, tpi, (double)DNINT(tpi));
-	ASSERT(HERE, (double)DNINT(tpi) == 3.0, cbuf);
+	ASSERT((double)DNINT(tpi) == 3.0, cbuf);
 
 	sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, ln2 = %20.3f,  DNINT(ln2) = %20.3f\n", RND_A, ln2, (double)DNINT(ln2));
-	ASSERT(HERE, (double)DNINT(ln2) == 1.0, cbuf);
+	ASSERT((double)DNINT(ln2) == 1.0, cbuf);
 
 #endif
 
@@ -2684,11 +2669,11 @@ to original "fiddle these depending on exponent being tested" scheme. */
 
 	FFT_MUL_BASE = (double)((uint64)1 << FFT_MUL_BITS);
 /* Intend to relax this later to allow powers of 2 as large as 2^54: */
-ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
+ASSERT(((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 
-	ASSERT(HERE, trailz64((uint64)FFT_MUL_BASE) == FFT_MUL_BITS, "mi64_cvt_double_uint64: trailz64((uint64)FFT_MUL_BASE) != FFT_MUL_BITS");
-	ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_double_uint64: FFT_MUL_BASE not pure-integer!");
-	ASSERT(HERE, FFT_MUL_BASE < 1.0*0x8000000*0x8000000, "mi64_cvt_double_uint64: FFT_MUL_BASE >= maximum allowed value of 2^54!");
+	ASSERT(trailz64((uint64)FFT_MUL_BASE) == FFT_MUL_BITS, "mi64_cvt_double_uint64: trailz64((uint64)FFT_MUL_BASE) != FFT_MUL_BITS");
+	ASSERT(DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_double_uint64: FFT_MUL_BASE not pure-integer!");
+	ASSERT(FFT_MUL_BASE < 1.0*0x8000000*0x8000000, "mi64_cvt_double_uint64: FFT_MUL_BASE >= maximum allowed value of 2^54!");
 	FFT_MUL_BASE_INV = 1.0/FFT_MUL_BASE;
 
   #if FAST_UINT32_MOD
@@ -2758,32 +2743,32 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 		}
 	}
 	printf ("%u cases of %u [%6.2f%%] needed adjustment.\n",nneg,ntry,100.*nneg/(float)ntry);
-	ASSERT(HERE, nfail == 0, "Fast-uint32-mod test failed for 1 or more inputs!");
+	ASSERT(nfail == 0, "Fast-uint32-mod test failed for 1 or more inputs!");
   #endif	// #if FAST_UINT32_MOD ?
 
 	/* Test approximate 1/x and 1/sqrt(x) routines: */
-	ftmp = finvest(1.5,  8);	/*fprintf(stderr, "finvest(1.5,  8) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/	ASSERT(HERE, fabs(ftmp - 0.666666666666667) < 4e-03, "Unacceptable level of error in finvest() call!");
-	ftmp = finvest(1.5, 53);	/*fprintf(stderr, "finvest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/	ASSERT(HERE, fabs(ftmp - 0.666666666666667) < 1e-14, "Unacceptable level of error in finvest() call!");
-	ftmp = finvest(1.0, 53);	/*fprintf(stderr, "finvest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
-	ftmp = finvest(2.0, 53);	/*fprintf(stderr, "finvest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.500000000000000));*/	ASSERT(HERE, fabs(ftmp - 0.500000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
-	ftmp = finvest(0.5, 53);	/*fprintf(stderr, "finvest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
-	ftmp = finvest(.75, 53);	/*fprintf(stderr, "finvest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.333333333333333));*/	ASSERT(HERE, fabs(ftmp - 1.333333333333333) < 1e-14, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(1.5,  8);	/*fprintf(stderr, "finvest(1.5,  8) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/	ASSERT(fabs(ftmp - 0.666666666666667) < 4e-03, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(1.5, 53);	/*fprintf(stderr, "finvest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/	ASSERT(fabs(ftmp - 0.666666666666667) < 1e-14, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(1.0, 53);	/*fprintf(stderr, "finvest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/	ASSERT(fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(2.0, 53);	/*fprintf(stderr, "finvest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.500000000000000));*/	ASSERT(fabs(ftmp - 0.500000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(0.5, 53);	/*fprintf(stderr, "finvest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/	ASSERT(fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(.75, 53);	/*fprintf(stderr, "finvest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.333333333333333));*/	ASSERT(fabs(ftmp - 1.333333333333333) < 1e-14, "Unacceptable level of error in finvest() call!");
 	/* Try some large and small inputs: */
-	ftmp = finvest(3.141592653589793e+15, 53);	/*fprintf(stderr, "finvest(3.141592653589793e+15, 53) gives err = %20.10e\n", fabs(ftmp - 3.183098861837907e-16));*/	ASSERT(HERE, fabs(ftmp - 3.183098861837907e-16) < 1e-14, "Unacceptable level of error in finvest() call!");
-	ftmp = finvest(3.183098861837907e-16, 53);	/*fprintf(stderr, "finvest(3.183098861837907e-16, 53) gives err = %20.10e\n", fabs(ftmp - 3.141592653589793e+15));*/	ASSERT(HERE, fabs(ftmp - 3.141592653589793e+15) < 1e+00, "Unacceptable level of error in finvest() call!");
-
-	ftmp = fisqrtest(1.5,  8);	/*fprintf(stderr, "fisqrtest(1.5,  8) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/	ASSERT(HERE, fabs(ftmp - 0.816496580927726) < 1e-3 , "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(1.5, 53);	/*fprintf(stderr, "fisqrtest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/	ASSERT(HERE, fabs(ftmp - 0.816496580927726) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(1.0, 53);	/*fprintf(stderr, "fisqrtest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(2.0, 53);	/*fprintf(stderr, "fisqrtest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.707106781186548));*/	ASSERT(HERE, fabs(ftmp - 0.707106781186548) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(0.5, 53);	/*fprintf(stderr, "fisqrtest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 1.414213562373095));*/	ASSERT(HERE, fabs(ftmp - 1.414213562373095) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(0.3, 53);	/*fprintf(stderr, "fisqrtest(0.3, 53) gives err = %20.10e\n", fabs(ftmp - 1.825741858350554));*/	ASSERT(HERE, fabs(ftmp - 1.825741858350554) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(.25, 53);	/*fprintf(stderr, "fisqrtest(.25, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/	ASSERT(HERE, fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(.75, 53);	/*fprintf(stderr, "fisqrtest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.154700538379251));*/	ASSERT(HERE, fabs(ftmp - 1.154700538379251) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(3.0, 53);	/*fprintf(stderr, "fisqrtest(3.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.577350269189626));*/	ASSERT(HERE, fabs(ftmp - 0.577350269189626) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = finvest(3.141592653589793e+15, 53);	/*fprintf(stderr, "finvest(3.141592653589793e+15, 53) gives err = %20.10e\n", fabs(ftmp - 3.183098861837907e-16));*/	ASSERT(fabs(ftmp - 3.183098861837907e-16) < 1e-14, "Unacceptable level of error in finvest() call!");
+	ftmp = finvest(3.183098861837907e-16, 53);	/*fprintf(stderr, "finvest(3.183098861837907e-16, 53) gives err = %20.10e\n", fabs(ftmp - 3.141592653589793e+15));*/	ASSERT(fabs(ftmp - 3.141592653589793e+15) < 1e+00, "Unacceptable level of error in finvest() call!");
+
+	ftmp = fisqrtest(1.5,  8);	/*fprintf(stderr, "fisqrtest(1.5,  8) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/	ASSERT(fabs(ftmp - 0.816496580927726) < 1e-3 , "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(1.5, 53);	/*fprintf(stderr, "fisqrtest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/	ASSERT(fabs(ftmp - 0.816496580927726) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(1.0, 53);	/*fprintf(stderr, "fisqrtest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/	ASSERT(fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(2.0, 53);	/*fprintf(stderr, "fisqrtest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.707106781186548));*/	ASSERT(fabs(ftmp - 0.707106781186548) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(0.5, 53);	/*fprintf(stderr, "fisqrtest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 1.414213562373095));*/	ASSERT(fabs(ftmp - 1.414213562373095) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(0.3, 53);	/*fprintf(stderr, "fisqrtest(0.3, 53) gives err = %20.10e\n", fabs(ftmp - 1.825741858350554));*/	ASSERT(fabs(ftmp - 1.825741858350554) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(.25, 53);	/*fprintf(stderr, "fisqrtest(.25, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/	ASSERT(fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(.75, 53);	/*fprintf(stderr, "fisqrtest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.154700538379251));*/	ASSERT(fabs(ftmp - 1.154700538379251) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(3.0, 53);	/*fprintf(stderr, "fisqrtest(3.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.577350269189626));*/	ASSERT(fabs(ftmp - 0.577350269189626) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
 	/* Try some large and small inputs: */
-	ftmp = fisqrtest(3.141592653589793e+15, 53);	/*fprintf(stderr, "fisqrtest(3.141592653589793e+15, 53); gives err = %20.10e\n", fabs(ftmp - 1.784124116152771e-08));*/	ASSERT(HERE, fabs(ftmp - 1.784124116152771e-08) < 1e-22, "Unacceptable level of error in fisqrtest() call!");
-	ftmp = fisqrtest(3.183098861837907e-16, 53);	/*fprintf(stderr, "fisqrtest(3.183098861837907e-16, 53); gives err = %20.10e\n", fabs(ftmp - 5.604991216397928e+07));*/	ASSERT(HERE, fabs(ftmp - 5.604991216397928e+07) < 1e-07, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(3.141592653589793e+15, 53);	/*fprintf(stderr, "fisqrtest(3.141592653589793e+15, 53); gives err = %20.10e\n", fabs(ftmp - 1.784124116152771e-08));*/	ASSERT(fabs(ftmp - 1.784124116152771e-08) < 1e-22, "Unacceptable level of error in fisqrtest() call!");
+	ftmp = fisqrtest(3.183098861837907e-16, 53);	/*fprintf(stderr, "fisqrtest(3.183098861837907e-16, 53); gives err = %20.10e\n", fabs(ftmp - 5.604991216397928e+07));*/	ASSERT(fabs(ftmp - 5.604991216397928e+07) < 1e-07, "Unacceptable level of error in fisqrtest() call!");
 
 	/* Now do a whole mess of 'em: */
 	for(i = 0; i < 100000; i++)
@@ -2794,24 +2779,24 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 			ftmp = finvest  (fran, 53);
 			finv = 1.0/fran;
 			ferr = (ftmp - finv)/(ftmp + finv);
-			ASSERT(HERE, fabs(ferr) < 1e-14, "Unacceptable level of error in finvest  () call!");
+			ASSERT(fabs(ferr) < 1e-14, "Unacceptable level of error in finvest  () call!");
 
 			ftmp = fisqrtest(fran, 53);
 			fsrt = 1.0/sqrt(fran);
 			ferr = (ftmp - fsrt)/(ftmp + fsrt);
-			ASSERT(HERE, fabs(ferr) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
+			ASSERT(fabs(ferr) < 1e-14, "Unacceptable level of error in fisqrtest() call!");
 		}
 
 		fran = rng_isaac_rand_double_norm_pos();
 		if(fran < 0.0 || fran >= 1.0) {
 			sprintf(cbuf, "check_nbits_in_types: rng_isaac_rand_double_norm_pos returns illegal value outside [0, 1): i = %d, %e\n", i,fran);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		}
 
 		fran = rng_isaac_rand_double_norm_pm1();
 		if(fabs(fran) >= 1.0) {
 			sprintf(cbuf, "check_nbits_in_types: rng_isaac_rand_double_norm_pm1 returns illegal value outside (-1,+1): i = %d, %e\n", i, fran);
-			ASSERT(HERE, 0, cbuf);
+			ASSERT(0, cbuf);
 		}
 	}
 
@@ -2823,7 +2808,7 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 	if(!nerr)
 		printf("fma_dmult_tests completed successfully!\n");
 	else
-		ASSERT(HERE, 0, "fma_dmult_tests failed!\n");
+		ASSERT(0, "fma_dmult_tests failed!\n");
 	*/
 #endif
 
@@ -2851,7 +2836,7 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 	{
 		order = 1ull << i;
 		prim_root_q(order, &root_re,&root_im);
-	//	printf("FGT: prim-root of order 2^%2u = %llu + I*%llu\n",i, root_re,root_im);
+	//	printf("FGT: prim-root of order 2^%2u = %" PRIu64 " + I*%" PRIu64 "\n",i, root_re,root_im);
 		// Check order-primitivity of roots of order > 1 by powering result up to 2nd order; result must == -1 (mod q):
 		if(i > 0) {
 			for(j = 1; j < i; j++) {
@@ -2859,9 +2844,9 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 				root_re = qreduce(root_re);	root_im = qreduce(root_im);	// Only partially reduce intermediates...
 			}
 			root_re = qreduce_finish(root_re);	root_im = qreduce_finish(root_im);	// ...and then finish reducing here.
-			ASSERT(HERE, root_re ==  q-1 && root_im == 0ull, "Bad prim_root_q result!");
+			ASSERT(root_re ==  q-1 && root_im == 0ull, "Bad prim_root_q result!");
 		} else {
-			ASSERT(HERE, root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!");
+			ASSERT(root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!");
 		}
 	}
 
@@ -2870,18 +2855,18 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 	// Power-of-2 roots satisfy simple conjugate rule, modular analog of complex conj(Re,Im) = (Re,-Im):
 	order = 16;	prim_root_q(order, &root_re,&root_im);
 	pow_modq(order-1, root_re,root_im, &re,&im);
-	printf("FGT: prim-root of order %u = %llu + I*%llu, Conjugate = %llu + I*%llu [q-Im = %llu]\n",(uint32)order, root_re,root_im, re,im,q-im);
+	printf("FGT: prim-root of order %u = %" PRIu64 " + I*%" PRIu64 ", Conjugate = %" PRIu64 " + I*%" PRIu64 " [q-Im = %" PRIu64 "]\n",(uint32)order, root_re,root_im, re,im,q-im);
 //	FGT: prim-root of order 16 = 1693317751237720973 + I*2283815672160731785,
 //					Conjugate =  1693317751237720973 + I*  22027337052962166 [q-Im = 2283815672160731785]
-	ASSERT(HERE, root_re == re && root_im == (q-im), "Bad power-of-2 conjugate!");
+	ASSERT(root_re == re && root_im == (q-im), "Bad power-of-2 conjugate!");
 
 	// Non-power-of-2 roots satisfy no simple conjugate rules, so multiply root and its conjugate together as sanity check:
 	order = 24;	prim_root_q(order, &root_re,&root_im);
 	pow_modq(order-1, root_re,root_im, &re,&im);
-	printf("FGT: prim-root of order %u = %llu + I*%llu, Conjugate = %llu + I*%llu [q-Im = %llu]\n",(uint32)order, root_re,root_im, re,im,q-im);
+	printf("FGT: prim-root of order %u = %" PRIu64 " + I*%" PRIu64 ", Conjugate = %" PRIu64 " + I*%" PRIu64 " [q-Im = %" PRIu64 "]\n",(uint32)order, root_re,root_im, re,im,q-im);
 	cmul_modq(root_re,root_im, re,im, &re,&im);
 	re = qreduce_full(re);	im = qreduce_full(im);
-	ASSERT(HERE, re == 1ull && im == 0ull, "Bad non-power-of-2 conjugate!");
+	ASSERT(re == 1ull && im == 0ull, "Bad non-power-of-2 conjugate!");
 /*
 	24th root:
 	FGT: prim-root of order 24 = 244692701471512749 + I*2061150307742181202,
@@ -2895,7 +2880,7 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16");
 	printf("Powers of prim-root:\n");
 	re = root_re;	im = root_im;
 	for(i = 0; i < order; i++) {
-		printf("%2u: %20llu[-= %20llu] + I*%20llu[-= %20llu]\n",i+1, re,q-re,im,q-im);
+		printf("%2u: %20" PRIu64 "[-= %20" PRIu64 "] + I*%20" PRIu64 "[-= %20" PRIu64 "]\n",i+1, re,q-re,im,q-im);
 		cmul_modq(root_re,root_im, re,im, &re,&im);
 		re = qreduce_full(re);	im = qreduce_full(im);
 	}
@@ -2943,11 +2928,11 @@ The four [+-d,+-d] and four powers of I are just the eight 8th roots of unity wh
 	{
 		order *= odd_ord_facs[i];
 		prim_root_q(order, &root_re,&root_im);
-	//	printf("FGT: prim-root of order %llu = %llu + I*%llu\n",order, root_re,root_im);
-		ASSERT(HERE, root_im == 0ull, "Odd roots must be strictly real!!");
+	//	printf("FGT: prim-root of order %" PRIu64 " = %" PRIu64 " + I*%" PRIu64 "\n",order, root_re,root_im);
+		ASSERT(root_im == 0ull, "Odd roots must be strictly real!!");
 		// Check order-primitivity of roots by raising result to (order)th power; result must == -1 (mod q):
 		pow_modq(order, root_re,root_im, &root_re,&root_im);
-		ASSERT(HERE, root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!");
+		ASSERT(root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!");
 	}
 	printf("fgt_m61 tests completed successfully!\n");
 #endif
@@ -3029,7 +3014,7 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562
 		int i,j,k, pow2;
 		double pow2_dmult,pow2_imult;
 		uint32 nerr = 0, itmp32;
-		const double crnd = 3.0*0x4000000*0x2000000, crnd50 = crnd*TWO50FLOAT;	// Consts used to emulate DNINT(x) and 2^50 * DNINT(x*2^-50)
+		const double crnd = 3.0*0x4000000*0x2000000; double crnd50 = crnd*TWO50FLOAT;	// Consts used to emulate DNINT(x) and 2^50 * DNINT(x*2^-50)
 										// (i.e. round-to-nearest-multiple-of-2^50 ... alas the AVX-512 VRNDSCALEPD instruction only supports
 										// round-to-nearest-multiple-of-negative-power-of-2, and said power is further restricted to pow < 16.
 		static vec_dbl *sc_arr = 0x0;
@@ -3037,12 +3022,12 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562
 		double *tmp, *dptr1,*dptr2,*dptr3,*dptr4, l2lo,l2hi, dblo,dbhi, sqr100lo[4],sqr100hi[4], dtmp,cy_max;
 		static double *ax,*bx,*cx,*dx, *ay,*by,*cy,*dy, *alo,*blo,*clo,*dlo, *ahi,*bhi,*chi,*dhi, *acy,*alo_norm,*ahi_norm;
 		uint64 itmp64, iax,ibx,icx,idx, iay,iby,icy,idy, ialo,iblo,iclo,idlo, iahi,ibhi,ichi,idhi;
-		const double prod1_adj = 3.0;	// Const to multiply by base and add to prod[1] to ensure latter >= 0
+		/* const */ double prod1_adj = 3.0;	// Const to multiply by base and add to prod[1] to ensure latter >= 0
 		if(!sc_arr) {
 			sc_arr = ALLOC_VEC_DBL(sc_arr, 8);
-			if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+			if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 			sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr);
-			ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+			ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 			/* Remember, rhese are POINTERS-TO-DOUBLES, so need an increment of 4 to span an AVX register: */
 			tmp = (double *)sc_ptr;
 			ax  = tmp + 0;	bx  = tmp + 1;	cx  = tmp + 2;	dx  = tmp + 3;	tmp += 4;
@@ -3062,7 +3047,7 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562
 		for(pow2 = 48; pow2 < 54; ++pow2) {
 			// Only makes sense to test up the #bits in an IEEE-double mantissa: Any larger and we start losing
 			// LSBs (I.e. the test may 'succeed' for pow2 > 53, but is only testing the equivalent of pow2 = 53):
-			ASSERT(HERE, pow2 < 54, "No point testing > 53-bit inputs due to loss of LSBs!");
+			ASSERT(pow2 < 54, "No point testing > 53-bit inputs due to loss of LSBs!");
 			printf("Testing fma_dmult for %d bits, dmult = %f:\n",pow2,pow2_dmult);
 			l2lo = l2hi = cy_max = 0.;	// Init log2-range-bounds-storing vars
 			for(j = 0; j < 4; j++) {
@@ -3143,25 +3128,25 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562
 			  #endif
 			  /*
 				if(pow2 == 53 && i < 100) {
-					printf("I = %d: ax = %llu ay = %llu ahi,alo = %f,%f\n",i, *ax,*ay, *ahi,*alo);
-					printf("I = %d: bx = %llu by = %llu bhi,blo = %f,%f\n",i, *bx,*by, *bhi,*blo);
-					printf("I = %d: cx = %llu cy = %llu chi,clo = %f,%f\n",i, *cx,*cy, *chi,*clo);
-					printf("I = %d: dx = %llu dy = %llu dhi,dlo = %f,%f\n",i, *dx,*dy, *dhi,*dlo);
+					printf("I = %d: ax = %" PRIu64 " ay = %" PRIu64 " ahi,alo = %f,%f\n",i, *ax,*ay, *ahi,*alo);
+					printf("I = %d: bx = %" PRIu64 " by = %" PRIu64 " bhi,blo = %f,%f\n",i, *bx,*by, *bhi,*blo);
+					printf("I = %d: cx = %" PRIu64 " cy = %" PRIu64 " chi,clo = %f,%f\n",i, *cx,*cy, *chi,*clo);
+					printf("I = %d: dx = %" PRIu64 " dy = %" PRIu64 " dhi,dlo = %f,%f\n",i, *dx,*dy, *dhi,*dlo);
 				}
 			  */
-				if(cmp_fma_lohi_vs_exact(*ax,*ay,*ahi,*alo, iax,iay,iahi,ialo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, A-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
-				if(cmp_fma_lohi_vs_exact(*bx,*by,*bhi,*blo, ibx,iby,ibhi,iblo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, B-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
-				if(cmp_fma_lohi_vs_exact(*cx,*cy,*chi,*clo, icx,icy,ichi,iclo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, C-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
-				if(cmp_fma_lohi_vs_exact(*dx,*dy,*dhi,*dlo, idx,idy,idhi,idlo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, D-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); }
+				if(cmp_fma_lohi_vs_exact(*ax,*ay,*ahi,*alo, iax,iay,iahi,ialo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, A-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); }
+				if(cmp_fma_lohi_vs_exact(*bx,*by,*bhi,*blo, ibx,iby,ibhi,iblo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, B-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); }
+				if(cmp_fma_lohi_vs_exact(*cx,*cy,*chi,*clo, icx,icy,ichi,iclo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, C-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); }
+				if(cmp_fma_lohi_vs_exact(*dx,*dy,*dhi,*dlo, idx,idy,idhi,idlo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, D-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); }
 			  #if 0
 				#error to-do!
 				double r1,r2, lo,hi;
 				r1 = rng_isaac_rand_double_norm_pm1() * pow2_dmult;	// in [-2^50, +2^50]
 				r2 = rng_isaac_rand_double_norm_pm1() * pow2_dmult;	// in [-2^50, +2^50]
 				mul50x50_debug(r1,r2, &lo,&hi);
-				printf("mul50x50_: a,b = %llu, %llu\n",*(uint64*)&r1,*(uint64*)&r2);
-				printf("mul50x50_: lo = %16llu\n",*(uint64*)alo);
-				printf("mul50x50_: hi = %16llu\n",*(uint64*)ahi);
+				printf("mul50x50_: a,b = %" PRIu64 ", %" PRIu64 "\n",*(uint64*)&r1,*(uint64*)&r2);
+				printf("mul50x50_: lo = %16" PRIu64 "\n",*(uint64*)alo);
+				printf("mul50x50_: hi = %16" PRIu64 "\n",*(uint64*)ahi);
 			  #endif
 
 			/******************** experimental code: Try squaring [lo,hi] (in ymm1,2), sans intermediate base-normalizations: *******************/
@@ -3240,10 +3225,10 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562
 			// Use 1.0f as format - .0 means no fractional part, and i/o routines will override the length-1 with actual length:
 			if(cy_max > 0) {
 				itmp64 = cy_max; itmp32 = trailz64(itmp64); itmp64 >>= itmp32;
-				printf("\tcy_max = %1.0f =  %llu * 2^%u\n",cy_max,itmp64,itmp32);
+				printf("\tcy_max = %1.0f =  %" PRIu64 " * 2^%u\n",cy_max,itmp64,itmp32);
 			} else if(cy_max < 0) {
 				itmp64 =-cy_max; itmp32 = trailz64(itmp64); itmp64 >>= itmp32;
-				printf("\tcy_max = %1.0f = -%llu * 2^%u\n",cy_max,itmp64,itmp32);
+				printf("\tcy_max = %1.0f = -%" PRIu64 " * 2^%u\n",cy_max,itmp64,itmp32);
 			} else {
 				printf("\tcy_max =  0\n");
 			}
@@ -3317,9 +3302,9 @@ void	mul50x50_debug(double a, double b, double *lo, double *hi)
 		if(retval) {
 			printf("In cmp_fma_lohi_vs_exact: FMA-double and pure-int DMUL results differ!\n");
 			printf("dx = %f; dy = %f; hi,lo = %f,%f\n",dx,dy, dhi * (1 - 2*(s1 != 0)), dlo * (1 - 2*(s0 != 0)));
-			printf("ix = %lld; iy = %lld; ihi,lo = %lld,%llu\n",ix,iy, ihi,ilo);
-			printf("Unsigned FMA result: ihi = %llX; ilo = %llX\n",*(uint64*)&dhi,*(uint64*)&dlo);
-			printf("nsh1,0 = %d,%d: ehi = %llu; elo = %llu [mlo = %c%llu]\n",nsh1,nsh0,exact.d1,exact.d0, char_sgn[s1 ^ s0],m0);
+			printf("ix = %" PRId64 "; iy = %" PRId64 "; ihi,lo = %" PRId64 ",%" PRIu64 "\n",ix,iy, ihi,ilo);
+			printf("Unsigned FMA result: ihi = %" PRIX64 "; ilo = %" PRIX64 "\n",*(uint64*)&dhi,*(uint64*)&dlo);
+			printf("nsh1,0 = %d,%d: ehi = %" PRIu64 "; elo = %" PRIu64 " [mlo = %c%" PRIu64 "]\n",nsh1,nsh0,exact.d1,exact.d0, char_sgn[s1 ^ s0],m0);
 		}
 		return retval;
 	}
@@ -3386,7 +3371,7 @@ much except as an auxiliary utility.
 uint32 reverse(uint32 i, uint32 nbits)
 {
 	uint32 j, tmp = 0;
-	ASSERT(HERE,nbits <= 32,"ERROR: bitlength limit 32 exceeded in call to REVERSE.\n");
+	ASSERT(nbits <= 32,"ERROR: bitlength limit 32 exceeded in call to REVERSE.\n");
 	for(j = 0; j < nbits; j++) {
 		tmp += tmp + (i & 1);
 		i >>= 1;
@@ -3595,7 +3580,7 @@ int ith_set_bit32(uint32 x, uint32 bit)
 	uint8 curr_byte;
 	int curr_pop,i,j,k,retval = 0;
 	if(!x || !bit) return -1;
-	ASSERT(HERE, bit <= 32, "[bit]th-bit specifier out of range!");
+	ASSERT(bit <= 32, "[bit]th-bit specifier out of range!");
 	// Find the byte in which the [bit]th set-bit occurs:
 	for(i = 0; i < 32; i += 8) {
 		curr_byte = (uint8)(x >> i);
@@ -3619,7 +3604,7 @@ int ith_set_bit64(uint64 x, uint32 bit)
 	uint8 curr_byte;
 	int curr_pop,i,j,k,retval = 0;
 	if(!x || !bit) return -1;
-	ASSERT(HERE, bit <= 64, "[bit]th-bit specifier out of range!");
+	ASSERT(bit <= 64, "[bit]th-bit specifier out of range!");
 	// Find the byte in which the [bit]th set-bit occurs:
 	for(i = 0; i < 64; i += 8) {
 		curr_byte = (uint8)(x >> i);
@@ -3901,7 +3886,7 @@ DEV uint64	getbits64(uint64 x, uint32 src_bit_start, uint32 nbits, uint32 tgt_bi
 {
 	const uint64 ones_mask = 0xFFFFFFFFFFFFFFFFull;
 	uint64 mask;
-	ASSERT(HERE, (nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!");
+	ASSERT((nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!");
 	if(nbits == 0) return 0;
 	mask = (ones_mask >> (64-nbits));
 	return ((x >> src_bit_start) & mask) << tgt_bit_start;
@@ -3914,7 +3899,7 @@ DEV void	mvbits64(uint64 x, uint32 src_bit_start, uint32 nbits, uint64*y, uint32
 {
 	const uint64 ones_mask = 0xFFFFFFFFFFFFFFFFull;
 	uint64 mask;
-	ASSERT(HERE, (nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!");
+	ASSERT((nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!");
 	if(nbits == 0) return;
 	mask = (ones_mask >> (64-nbits));
 	/* Zero out the target bits: */
@@ -3991,7 +3976,7 @@ DEV uint32 is_prime(uint32 n) {
 // Get nearest Fermat 2-PRP to N in the specified search direction, up or down. Algorithm is slow try-next-odd:
 DEV uint32 next_prime(uint32 n, int dir) {
 	// direction properly specified?
-	ASSERT(HERE, ABS(dir) == 1,"next_prime(): Direction of search not properly specified, must = +1 (up) or -1 (down).");
+	ASSERT(ABS(dir) == 1,"next_prime(): Direction of search not properly specified, must = +1 (up) or -1 (down).");
 	// Some special-casing for small n:
 	if(n <= 3 && dir == -1) {
 		return(2*(n == 3));
@@ -4124,7 +4109,7 @@ DEV uint32 twompmodq32(uint32 p, uint32 q)	// 2^-p % q
 	 int32 j;
 	uint32 lead5, pshift, qhalf, qinv, zshift, start_index, x, lo, hi;
 
-	ASSERT(HERE, (q&1) == 1, "twompmodq32: even modulus!");
+	ASSERT((q&1) == 1, "twompmodq32: even modulus!");
 	qhalf = q >> 1;	/* = (q-1)/2, since q odd. */
 
 	pshift = p + 32;
@@ -4166,7 +4151,7 @@ DEV uint32 twompmodq32(uint32 p, uint32 q)	// 2^-p % q
 
 	if((pshift >> j) & (uint32)1)
 	{
-		DBG_ASSERT(HERE, x < q,"util.c: x < q");
+		DBG_ASSERT(x < q,"util.c: x < q");
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(x > qhalf) {
 			x += x;
@@ -4201,7 +4186,7 @@ DEV int twopmodq32(uint32 p, uint32 q)	// (2^-p % q) == 0
 	 int32 j;
 	uint32 lead5, pshift, qhalf, qinv, zshift, start_index, x, lo, hi;
 
-	ASSERT(HERE, (q&1) == 1, "twopmodq32: even modulus!");
+	ASSERT((q&1) == 1, "twopmodq32: even modulus!");
 	qhalf = q >> 1;	/* = (q-1)/2, since q odd. */
 	pshift = p + 32;
 	if(pshift < p)	/* Need special-casing for p just below 2^32  - the primes 2^32-(5,17) are good testcases here. */
@@ -4253,7 +4238,7 @@ DEV int twopmodq32(uint32 p, uint32 q)	// (2^-p % q) == 0
 
 	if((pshift >> j) & (uint32)1)
 	{
-		DBG_ASSERT(HERE, x < q,"util.c: x < q");
+		DBG_ASSERT(x < q,"util.c: x < q");
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
 		if(x > qhalf) {
 			x += x;
@@ -4314,7 +4299,7 @@ DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uin
 	uint32 lead6, pshift6, qinv6, zshift6, x6, lo6, hi6, qhalf6;
 	uint32 lead7, pshift7, qinv7, zshift7, x7, lo7, hi7, qhalf7;
 
-	DBG_ASSERT(HERE, (q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq32_x8: Inputs nonmonotone!");
+	DBG_ASSERT((q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq32_x8: Inputs nonmonotone!");
 
 	qhalf0 = q0 >> 1;	/* = (q-1)/2, since q odd. */
 	qhalf1 = q1 >> 1;
@@ -4445,14 +4430,14 @@ DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uin
 	x7  = q7 - lo7;
 
 	/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
-	if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(HERE, x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); }
-	if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(HERE, x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); }
-	if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(HERE, x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); }
-	if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(HERE, x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); }
-	if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(HERE, x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); }
-	if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(HERE, x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); }
-	if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(HERE, x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); }
-	if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(HERE, x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); }
+	if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); }
+	if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); }
+	if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); }
+	if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); }
+	if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); }
+	if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); }
+	if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); }
+	if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); }
 
 	for(j = start_index-2; j >= 0; j--)
 	{
@@ -4493,14 +4478,14 @@ DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uin
 		x7 = hi7 - lo7 + ((-(hi7 < lo7)) & q7);
 
 		/* Combines overflow-on-add and need-to-subtract-q-from-sum checks */
-		if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(HERE, x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); }
-		if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(HERE, x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); }
-		if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(HERE, x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); }
-		if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(HERE, x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); }
-		if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(HERE, x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); }
-		if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(HERE, x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); }
-		if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(HERE, x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); }
-		if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(HERE, x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); }
+		if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); }
+		if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); }
+		if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); }
+		if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); }
+		if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); }
+		if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); }
+		if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); }
+		if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); }
 	}
 
 	/*...Double and return.	These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. */
@@ -4565,9 +4550,9 @@ DEV uint32 egcd32_B(int32 *x, int32 *y)
 	int32 d, e, f;
 
 	if(*x == *y) {
-		printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x);	ASSERT(HERE, 0,"0");
+		printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x);	ASSERT(0,"0");
 	} else if((*x == 0) || (*y == 0)) {
-		printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y);		ASSERT(HERE, 0,"0");
+		printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y);		ASSERT(0,"0");
 	}
 
 	while(w) {
@@ -4598,9 +4583,9 @@ DEV uint32 egcd32(uint32 *x, uint32 *y)
 	uint32 d, e, f;
 
 	if(*x == *y) {
-		printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x);	ASSERT(HERE, 0,"0");
+		printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x);	ASSERT(0,"0");
 	} else if((*x == 0) || (*y == 0)) {
-		printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y);		ASSERT(HERE, 0,"0");
+		printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y);		ASSERT(0,"0");
 	}
 
 	while(w)
@@ -4638,9 +4623,9 @@ DEV uint64 egcd64(uint64 *x, uint64 *y)
 	/* Sign of these 3 doesn't matter since they're just temporaries: */
 	uint64 d, e, f;
 	if(*x == *y) {
-		printf("ERROR: eGCD of identical arguments x = y = %llu is illegal!\n", *x);	ASSERT(HERE, 0,"0");
+		printf("ERROR: eGCD of identical arguments x = y = %" PRIu64 " is illegal!\n", *x);	ASSERT(0,"0");
 	} else if((*x | *y) == 0ull) {
-		printf("ERROR: eGCD called with zero input: x = %llu, y = %llu\n", *x, *y);		ASSERT(HERE, 0,"0");
+		printf("ERROR: eGCD called with zero input: x = %" PRIu64 ", y = %" PRIu64 "\n", *x, *y);		ASSERT(0,"0");
 	}
 	while(w) {
 		q = g/w;
@@ -4672,8 +4657,8 @@ int32 x2 = z, y2 = n, gcd2;
 	if(x2 < 0)	// since egcd32() only does positive-result normalization on x-output, only do it here to the egcd32_B x-output
 		x2 += n;
 	if(gcd != gcd2 || x != x2 || y != y2)
-		ASSERT(HERE, 0,"2 gcd results in modinv32 differ!");
-	ASSERT(HERE, gcd == 1,"gcd in modinv32 is non-unity!");
+		ASSERT(0,"2 gcd results in modinv32 differ!");
+	ASSERT(gcd == 1,"gcd in modinv32 is non-unity!");
 	return x;
 }
 
@@ -4681,7 +4666,7 @@ DEV int64 modinv64(uint64 z, uint64 n)
 {
 	uint64 x = z, y = n, gcd;
 	gcd = egcd64(&x, &y);
-	ASSERT(HERE, gcd == 1ull,"gcd in modinv64 is non-unity!");
+	ASSERT(gcd == 1ull,"gcd in modinv64 is non-unity!");
 	return x;
 }
 
@@ -4884,7 +4869,7 @@ uint32 x128_div_y32(uint128 *x, uint32 y)
 		cy = (two64mody >> 63);
 		two64mody += (-cy) & y;
 		two64divy += (cy == 0);
-/*printf("INIT: two64divy, two64mody = %20llu %20llu\n\n", two64divy, two64mody); */
+/*printf("INIT: two64divy, two64mody = %20" PRIu64 " %20" PRIu64 "\n\n", two64divy, two64mody); */
 	}
 
 	/* Divide high digit by y, storing remainder in cy: */
@@ -4902,7 +4887,7 @@ uint32 x128_div_y32(uint128 *x, uint32 y)
 	prior to dividing risks unsigned integer overflow:
 	*/
 	(x->d0) = cy*two64divy + tsum/y + (x->d0)/y;
-/*printf("%20llu %20llu %2llu %2llu\n", x->d1, x->d0, cy, rem); */
+/*printf("%20" PRIu64 " %20" PRIu64 " %2" PRIu64 " %2" PRIu64 "\n", x->d1, x->d0, cy, rem); */
 	return (uint32)rem;
 }
 
@@ -5350,7 +5335,7 @@ double	convert_base10_char_double (const char*char_buf)
 
 			if(c == '.')	/* Found a decimal point */
 			{
-				ASSERT(HERE, curr_mul == 0.0,"curr_mul == 0.0");	/* Make sure this is the first . we've encountered */
+				ASSERT(curr_mul == 0.0,"curr_mul == 0.0");	/* Make sure this is the first . we've encountered */
 				curr_mul = 1.0;
 				continue;
 			}
@@ -5361,12 +5346,12 @@ double	convert_base10_char_double (const char*char_buf)
 			else
 			{
 				fprintf(stderr,"convert_base10_char_double: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
-				ASSERT(HERE, curr_mul == 0.0,"curr_mul == 0.0");
+				ASSERT(curr_mul == 0.0,"curr_mul == 0.0");
 			}
 		}
 		curr_mul *= 0.1;	/* Only has an effect if we're to the right of the DP */
 		curr_digit = (uint64)(c - CHAROFFSET);
-		ASSERT(HERE, curr_digit < 10,"convert_base10_char_double: curr_digit < 10");
+		ASSERT(curr_digit < 10,"convert_base10_char_double: curr_digit < 10");
 		/* Store 10*currsum in a 128-bit product, so can check for overflow: */
 	#ifdef MUL_LOHI64_SUBROUTINE
 		MUL_LOHI64((uint64)10,curr_sum,&curr_sum,&hi);
@@ -5376,7 +5361,7 @@ double	convert_base10_char_double (const char*char_buf)
 		if(hi != 0)
 		{
 			fprintf(stderr, "ERROR: Mul-by-10 overflows in convert_base10_char_double: Offending input string = %s\n", char_buf);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 		curr_sum += curr_digit;	/* Since currsum now a multiple of 10, adding a single digit at the low end can't overflow */
 	}
@@ -5385,7 +5370,7 @@ double	convert_base10_char_double (const char*char_buf)
 	and return that; otherwise we return (double)curr_sum*curr_mul .
 	*/
 #if 0
-	printf("convert_base10_char_double: char_buf = %s, curr_sum = %llu, curr_mul = %lf\n",char_buf, curr_sum, curr_mul);
+	printf("convert_base10_char_double: char_buf = %s, curr_sum = %" PRIu64 ", curr_mul = %lf\n",char_buf, curr_sum, curr_mul);
 #endif
 	if(curr_mul == 0.0)
 	{
@@ -5435,11 +5420,11 @@ uint64 convert_base10_char_uint64 (const char*char_buf)
 			else
 			{
 				fprintf(stderr,"convert_base10_char_uint64: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 		curr_digit = (uint64)(c - CHAROFFSET);
-		ASSERT(HERE, curr_digit < 10,"convert_base10_char_uint64: curr_digit < 10");
+		ASSERT(curr_digit < 10,"convert_base10_char_uint64: curr_digit < 10");
 		/* Store 10*currsum in a 128-bit product, so can check for overflow: */
 	#ifdef MUL_LOHI64_SUBROUTINE
 		MUL_LOHI64((uint64)10,curr_sum,&curr_sum,&hi);
@@ -5449,7 +5434,7 @@ uint64 convert_base10_char_uint64 (const char*char_buf)
 		if(hi != 0)
 		{
 			fprintf(stderr, "ERROR: Mul-by-10 overflows in convert_base10_char_uint64: Offending input string = %s\n", char_buf);
-			ASSERT(HERE, 0,"0");
+			ASSERT(0,"0");
 		}
 		curr_sum += curr_digit;	/* Since currsum now a multiple of 10, adding a single digit at the low end can't overflow */
 	}
@@ -5503,11 +5488,11 @@ uint128	convert_base10_char_uint128(const char*char_buf)
 			else
 			{
 				fprintf(stderr,"convert_base10_char_uint128: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 		curr_digit = (uint64)(c - CHAROFFSET);
-		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
+		ASSERT(curr_digit < 10,"util.c: curr_digit < 10");
 		/* currsum *= 10, and check for overflow: */
 		tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len);
 		if(tmp != 0)
@@ -5515,13 +5500,13 @@ uint128	convert_base10_char_uint128(const char*char_buf)
 			if(len == LEN_MAX)
 			{
 				fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT128: Offending input string = %s\n", char_buf);
-				ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX");
+				ASSERT(len <= LEN_MAX,"len <= LEN_MAX");
 			}
 			curr_sum[len++] = tmp;
 		}
 
 		len += mi64_add_scalar(curr_sum, curr_digit, curr_sum, len);
-		ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX");
+		ASSERT(len <= LEN_MAX,"len <= LEN_MAX");
 	}
 
 	x128.d0 = curr_sum[0];
@@ -5566,11 +5551,11 @@ uint192	convert_base10_char_uint192(const char*char_buf)
 			else
 			{
 				fprintf(stderr,"convert_base10_char_uint192: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 		curr_digit = (uint64)(c - CHAROFFSET);
-		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
+		ASSERT(curr_digit < 10,"util.c: curr_digit < 10");
 		/* currsum *= 10, and check for overflow: */
 		tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len);
 		if(tmp != 0)
@@ -5578,13 +5563,13 @@ uint192	convert_base10_char_uint192(const char*char_buf)
 			if(len == LEN_MAX)
 			{
 				fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT192: Offending input string = %s\n", char_buf);
-				ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX");
+				ASSERT(len <= LEN_MAX,"len <= LEN_MAX");
 			}
 			curr_sum[len++] = tmp;
 		}
 
 		len += mi64_add_scalar(curr_sum, curr_digit, curr_sum, len);
-		ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX");
+		ASSERT(len <= LEN_MAX,"len <= LEN_MAX");
 	}
 
 	x192.d0 = curr_sum[0];
@@ -5630,11 +5615,11 @@ uint256	convert_base10_char_uint256(const char*char_buf)
 			else
 			{
 				fprintf(stderr,"convert_base10_char_uint256: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c);
-				ASSERT(HERE, 0,"0");
+				ASSERT(0,"0");
 			}
 		}
 		curr_digit = (uint64)(c - CHAROFFSET);
-		ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10");
+		ASSERT(curr_digit < 10,"util.c: curr_digit < 10");
 		/* currsum *= 10, and check for overflow: */
 		tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len);
 		if(tmp != 0)
@@ -5642,13 +5627,13 @@ uint256	convert_base10_char_uint256(const char*char_buf)
 			if(len == LEN_MAX)
 			{
 				fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT256: Offending input string = %s\n", char_buf);
-				ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX");
+				ASSERT(len <= LEN_MAX,"len <= LEN_MAX");
 			}
 			curr_sum[len++] = tmp;
 		}
 
 		len += mi64_add_scalar(curr_sum, curr_digit, curr_sum, len);
-		ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX");
+		ASSERT(len <= LEN_MAX,"len <= LEN_MAX");
 	}
 
 	x256.d0 = curr_sum[0];
@@ -5730,7 +5715,7 @@ double	finvest(double x, uint32 numbits)
 	exp  = (itmp >> 52) & MASK_EXP;
 	mant =  itmp        & MASK_MANT;
 	/* Make sure number is normalized: */
-	ASSERT(HERE, exp != 0,"finvest: denormalized inputs illegal!");
+	ASSERT(exp != 0,"finvest: denormalized inputs illegal!");
 
 	/* Store most-significant 8 non-hidden bits: */
 	byteval = (mant >> 44) & 0x000000ff;
@@ -5770,7 +5755,7 @@ ftmp0 = ftmp;
 	if(fabs(err_num)/fabs(err_den) >= 2e-3)
 	{
 		sprintf(cbuf, "finvtest: ftmp0 too inaccurate! ftmp = %e, ftmp0 = %e, relerr = %e\n", ftmp, ftmp0,fabs(err_num)/fabs(err_den));
-		ASSERT(HERE, 0, cbuf);
+		ASSERT(0, cbuf);
 	}
 
 	return ftmp;
@@ -5799,7 +5784,7 @@ double	fisqrtest(double x, uint32 numbits)
 	exp  = (itmp >> 52) & MASK_EXP;
 	mant =  itmp        & MASK_MANT;
 	/* Make sure number is normalized: */
-	ASSERT(HERE, exp != 0,"finvest: denormalized inputs illegal!");
+	ASSERT(exp != 0,"finvest: denormalized inputs illegal!");
 
 	/* Store most-significant 9 non-hidden bits - we'll use either all
 	or the high 8 of these, depending on the parity of the exponent: */
@@ -5873,7 +5858,7 @@ ftmp0 = ftmp;
 	if(fabs(err_num)/fabs(err_den) >= 2e-3)
 	{
 		sprintf(cbuf, "fisqrtest: ftmp0 too inaccurate! ftmp = %e, ftmp0 = %e, relerr = %e\n", ftmp, ftmp0,fabs(err_num)/fabs(err_den));
-		ASSERT(HERE, 0, cbuf);
+		ASSERT(0, cbuf);
 	}
 
 	return ftmp;
@@ -5932,7 +5917,7 @@ ftmp0 = ftmp;
   #ifdef USE_AVX1024
 	int	test_simd_transpose_16x16()
 	{
-		ASSERT(HERE,0,"function not yet supported!");
+		ASSERT(0,"function not yet supported!");
 		return 0;
 	}
   #endif
@@ -5947,7 +5932,7 @@ ftmp0 = ftmp;
 		const int dim = 64;	// #elements in our matrix, allocate 2x this to allow for real/imag side-by-side variant
 		vec_dbl *mem = 0x0, *data;
 		mem = ALLOC_VEC_DBL(mem, 2*dim+4);	// Add 4 pads to allow for alignment on up-to-128-byte boundary
-		data = ALIGN_VEC_DBL(mem);	ASSERT(HERE, ((long)data & 0x1f) == 0, "data not 32-byte aligned!");
+		data = ALIGN_VEC_DBL(mem);	ASSERT(((long)data & 0x1f) == 0, "data not 32-byte aligned!");
 		// Init the matrix -  Input matrix has rows containing [0-7][8-15]...[56-63]:
 		double *dptr = (double *)data;
 		for(i = 0; i < dim; i++) { *(dptr+i) = i; }
@@ -7092,7 +7077,7 @@ ftmp0 = ftmp;
 		const int dim = 16;		// #elements in our matrix
 		vec_dbl *mem  = ALLOC_VEC_DBL(mem, dim+4);	// Add 4 pads to allow for alignment on up-to-128-byte boundary
 		vec_dbl *data = ALIGN_VEC_DBL(mem);
-		ASSERT(HERE, ((long)data & 0x1f) == 0, "data not 32-byte aligned!");
+		ASSERT(((long)data & 0x1f) == 0, "data not 32-byte aligned!");
 		// Init the matrix -  Input matrix has rows:
 		double *dptr = (double *)data;	//  0, 1, 2, 3
 		for(i = 0; i < dim; i++) {		//  4, 5, 6, 7
@@ -7251,9 +7236,9 @@ ftmp0 = ftmp;
 								13.,15.,31.,16.,-17.,27.,45.,28.,6.,-25.,-24.,15.,-6.,-1.,48.,-57.};
 		vec_dbl *c_tmp,*s_tmp, *cc0,*two, *r0,*r1,*r2,*r3;
 		// Alloc 8 vector-complex elts (16 vec_dbl) per input/output block rather than 4, so can also test two radix-4 DFTs done side-by-side:
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x42);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x42);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
 		add0 = sc_ptr;
 		add1 = sc_ptr+0x2;
 		add2 = sc_ptr+0x4;
@@ -7281,7 +7266,7 @@ ftmp0 = ftmp;
 			VEC_DBL_INIT(c_tmp+6, ran[i+8]);	VEC_DBL_INIT(s_tmp+6, ran[i+9]);
 /*
 			// Restructure twiddle-muls to use cotangent-scheme:
-			ASSERT(HERE, ran[i+1] != 0.0 && ran[i+9] != 0.0,"Need to modify test-twiddles to avoid div-by-0!");
+			ASSERT(ran[i+1] != 0.0 && ran[i+9] != 0.0,"Need to modify test-twiddles to avoid div-by-0!");
 			VEC_DBL_INIT(c_tmp  , ran[i  ]/(double)ran[i+1]);	VEC_DBL_INIT(s_tmp  , ran[i+1]);
 			VEC_DBL_INIT(c_tmp+8, ran[i+8]/(double)ran[i+9]);	VEC_DBL_INIT(s_tmp+8, ran[i+9]);
 */
@@ -7502,7 +7487,7 @@ ftmp0 = ftmp;
 			nerr += (fabs(*(dptr+3) - ref1[j+3]) > 1e-10);
 			dptr += 4;
 		}
-		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "Outputs mismatch ref-data!");
 
 	// Timing loop #2 - two radix-4 DFTs (operating on separate data chunks but sharing twiddles) side-by-side:
 		/* 6 May 2016, Core2:
@@ -7542,7 +7527,7 @@ ftmp0 = ftmp;
 			nerr += (*dptr != ref1[j]) + (*(dptr+1) != ref1[j+1]) + (*(dptr+2) != ref1[j+2]) + (*(dptr+3) != ref1[j+3]);
 			dptr += 4;
 		}
-		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "Outputs mismatch ref-data!");
 
 	// Timing loop #3 - single radix-4 DIT DFT:
 		dim = 8*RE_IM_STRIDE;	// 4 vector-complex data
@@ -7565,7 +7550,7 @@ ftmp0 = ftmp;
 			nerr += (*dptr != ref2[j]) + (*(dptr+1) != ref2[j+1]) + (*(dptr+2) != ref2[j+2]) + (*(dptr+3) != ref2[j+3]);
 			dptr += 4;
 		}
-		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "Outputs mismatch ref-data!");
 
 	// Timing loop #4 - two radix-4 DIT DFTs (operating on separate data chunks but sharing twiddles) side-by-side:
 		for(j = 0; j < dim+dim; j++) { *(add0+j) = ran[j]; }
@@ -7588,7 +7573,7 @@ ftmp0 = ftmp;
 			nerr += (*dptr != ref2[j]) + (*(dptr+1) != ref2[j+1]) + (*(dptr+2) != ref2[j+2]) + (*(dptr+3) != ref2[j+3]);
 			dptr += 4;
 		}
-		ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "Outputs mismatch ref-data!");
 
 		free((void *)sc_arr);	sc_arr=0x0;
 		return nerr;
@@ -7655,9 +7640,9 @@ ftmp0 = ftmp;
 		const int stride = 2*RE_IM_STRIDE, dim = stride<<4;
 		double c1,c2,c3,c4,c5,c6,c7,c8,c9,cA,cB,cC,cD,cE,cF, s1,s2,s3,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF;
 		static double *a,*a_ptr;	// Dimension = number of scalar-doubles in 16 vector-complex in SIMD build mode
-		a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE);	if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE);	if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		a     = ALIGN_VEC_DBL(a_ptr);
-		ASSERT(HERE, ((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!");
+		ASSERT(((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!");
 	#ifdef USE_SSE2
 		const int pfetch_dist = 0;
 		int pfetch_addr = 0;	// Don't care about pfetch in this lcal-mem context, so just set these = 0
@@ -7665,9 +7650,9 @@ ftmp0 = ftmp;
 		double *add0,*add1,*add2;	/* Addresses into array sections */
 		vec_dbl *c_tmp,*s_tmp, *i0,*i1,*i2,*i3, *o0,*o1,*o2,*o3;
 		static vec_dbl *cc0, *ss0, *isrt2, *two, *r00;
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 72);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 72);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!");
 		r00 = sc_ptr + 0x00;	  isrt2 = sc_ptr + 0x20;
 									cc0 = sc_ptr + 0x21;
 									ss0 = sc_ptr + 0x22;
@@ -8152,7 +8137,7 @@ exit(0);
 		#endif
 		}
 printf("DIF: nerr = %u, ",nerr);
-		ASSERT(HERE, nerr == 0, "DIF Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "DIF Outputs mismatch ref-data!");
 		printf("\tSummed roundoff error = %20.10e]\n",avg_err);
 
 		//******************* Timing loop for Radix-16 DIT transform macro: *******************
@@ -8465,7 +8450,7 @@ exit(0);
 			dtmp = fabs(a[j1+1] - ref2[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; };
 		#endif
 		}
-		ASSERT(HERE, nerr == 0, "DIT Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "DIT Outputs mismatch ref-data!");
 		printf("\tSummed roundoff error = %20.10e]\n",avg_err);
 
 	#ifdef USE_SSE2
@@ -8563,9 +8548,9 @@ exit(0);
 		const int stride = 2*RE_IM_STRIDE, dim = stride<<5, idx[32] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62};
 		double cc[32],ss[32];
 		static double *a,*a_ptr;	// Dimension = number of scalar-doubles in 16 vector-complex in SIMD build mode
-		a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE);	if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE);	if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		a     = ALIGN_VEC_DBL(a_ptr);
-		ASSERT(HERE, ((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!");
+		ASSERT(((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!");
 	#ifdef USE_SSE2
 		const int pfetch_dist = 0;
 		int pfetch_addr = 0;	// Don't care about pfetch in this lcal-mem context, so just set these = 0
@@ -8573,9 +8558,9 @@ exit(0);
 		double *add0;	/* Addresses into array sections */
 		vec_dbl *c_tmp,*s_tmp;
 		static vec_dbl *isrt2,*sqrt2, *cc0, *ss0, *cc1, *ss1, *cc3, *ss3, *one,*two, *r00,*r10,*r20,*r30;
-		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(HERE, 0,cbuf); }
+		sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90);	if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf);	ASSERT(0,cbuf); }
 		sc_ptr = ALIGN_VEC_DBL(sc_arr);
-		ASSERT(HERE, ((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!");
+		ASSERT(((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!");
 		r00 = sc_ptr;
 		r10 = r00 + 0x10;
 		r20 = r00 + 0x20;
@@ -8756,7 +8741,7 @@ exit(0);
 			nerr += (fabs(a[j1+1] - ref1[j2+1]) > 1e-10);
 		#endif
 		}
-		ASSERT(HERE, nerr == 0, "DIF Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "DIF Outputs mismatch ref-data!");
 		printf("\tSummed roundoff error = %20.10e]\n",avg_err);
 	#if 0
 		10^6-timing:	setup	+=DIF	DIF-only
@@ -8877,7 +8862,7 @@ exit(0);
 			nerr += (fabs(a[j1+1] - ref2[j2+1]) > 1e-10);
 		#endif
 		}
-		ASSERT(HERE, nerr == 0, "DIT Outputs mismatch ref-data!");
+		ASSERT(nerr == 0, "DIT Outputs mismatch ref-data!");
 		printf("\tSummed roundoff error = %20.10e]\n",avg_err);
 	#if 0
 		10^6-timing:	setup	+=DIF	DIF-only
@@ -8993,7 +8978,7 @@ exit(0);
 		int ncpu = get_num_cores(), nshift, nextra;
 		printf("Mlucas running as system-created pthread %u, threading self-test will use %d user-created pthreads.\n", (int)pth, nthreads);
 		if(verbose) {
-			ASSERT(HERE, nthreads > 0,"Mlucas.c: nthreads > 0");
+			ASSERT(nthreads > 0,"Mlucas.c: nthreads > 0");
 			if(nthreads > ncpu) {
 				printf("WARN: Test using more threads[%d] than there are available CPUs[%d].\n", nthreads, ncpu);
 			}
@@ -9088,7 +9073,7 @@ exit(0);
 
 		// 10 sequential iters of test loop yield successive values -1452071552,1390824192,-61247360,-1513318912,1329576832,
 		// -122494720,-1574566272,1268329472,-1837420,-1635813632:
-		ASSERT(HERE, isum == -1635813632, "retval error!");
+		ASSERT(isum == -1635813632, "retval error!");
 		return 0;
 	}
 
@@ -9123,7 +9108,7 @@ exit(0);
 		int i;                      /* counter, to print numbers */
 		int j;                      /* counter, for delay        */
 		int k = 0;	/* accumulator to keep gcc from otimizing away delay-multiply inside test loop */
-		ASSERT(HERE, thread_arg != 0x0, "do_loop test function for pthread-test needs live thread_arg pointer!");
+		ASSERT(thread_arg != 0x0, "do_loop test function for pthread-test needs live thread_arg pointer!");
 
 	  #if 0	// BSD thread affinity API barfs in my Mac builds
 		cpuset_t *cset;
@@ -9132,7 +9117,7 @@ exit(0);
 
 		cset = cpuset_create();
 		if (cset == NULL) {
-			ASSERT(HERE, 0, "cpuset_create");
+			ASSERT(0, "cpuset_create");
 		}
 		ci = 0;
 		cpuset_set(ci, cset);
@@ -9140,7 +9125,7 @@ exit(0);
 		pth = pthread_self();
 		error = pthread_setaffinity_np(pth, cpuset_size(cset), cset);
 		if (error) {
-			ASSERT(HERE, 0, "pthread_setaffinity_np");
+			ASSERT(0, "pthread_setaffinity_np");
 		}
 		cpuset_destroy(cset);
 	  #endif
@@ -9172,20 +9157,20 @@ exit(0);
 		nobjs1 = hwloc_get_nbobjs_by_type (topology, HWLOC_OBJ_CORE);
 		nobjs2 = hwloc_get_nbobjs_by_depth(topology, depth);
 		if(nobjs1 != nobjs2) {
-			snprintf(cbuf,STR_MAX_LEN,"#objects of type CORE (%d) mismatches #objects (%d) at depth %d (topo depth = %d).",nobjs1,nobjs2,depth,topodepth);
-			ASSERT(HERE,0,cbuf);
+			snprintf(cbuf,STR_MAX_LEN*2,"#objects of type CORE (%d) mismatches #objects (%d) at depth %d (topo depth = %d).",nobjs1,nobjs2,depth,topodepth);
+			ASSERT(0,cbuf);
 		}
 		// Loop over HWLOC_OBJ_CORE objects corr. to index range:
 		for (i = lidx_lo; i <= lidx_hi; i++) {
 			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i);
 			if (!obj) {
-				snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i);	ASSERT(HERE,0,cbuf);
+				snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i);	ASSERT(0,cbuf);
 			}
-			ASSERT(HERE, obj->type == HWLOC_OBJ_CORE, "[hwloc] Error: Object not of expected type CORE.");
+			ASSERT(obj->type == HWLOC_OBJ_CORE, "[hwloc] Error: Object not of expected type CORE.");
 			while(obj && (obj->type != HWLOC_OBJ_PACKAGE)) {
 				obj = obj->parent;
 			}
-			ASSERT(HERE, obj != 0, "[hwloc] Error: PACKAGE Object not found.");
+			ASSERT(obj != 0, "[hwloc] Error: PACKAGE Object not found.");
 			if(obj->logical_index != socket_idx) {
 				nsockets++;
 				socket_idx = obj->logical_index;
@@ -9202,22 +9187,22 @@ exit(0);
 	{
 		int ncpu = 0, lo = -1,hi = lo,incr = 1, i,j,bit,word;
 		char *char_addr = istr, *endp;
-		ASSERT(HERE, char_addr != 0x0, "Null input-string pointer!");
+		ASSERT(char_addr != 0x0, "Null input-string pointer!");
 		size_t len = strlen(istr);
 		if(len == 0) return 0;	// Allow 0-length input, resulting in no-op
-		ASSERT(HERE, len <= STR_MAX_LEN, "Excessive input-substring length!");
-		lo = strtoul(char_addr, &endp, 10);	ASSERT(HERE, lo >= 0, "lo-substring not a valid nonnegative number!");
+		ASSERT(len <= STR_MAX_LEN, "Excessive input-substring length!");
+		lo = strtoul(char_addr, &endp, 10);	ASSERT(lo >= 0, "lo-substring not a valid nonnegative number!");
 		if(*endp) {
-			ASSERT(HERE, *endp == ':', "Non-colon separator in core-affinity-triplet substring!");
+			ASSERT(*endp == ':', "Non-colon separator in core-affinity-triplet substring!");
 			char_addr = endp+1;
 			hi = strtoul(char_addr, &endp, 10);
-			ASSERT(HERE, hi >= lo, "hi-substring not a valid number >= lo!");
+			ASSERT(hi >= lo, "hi-substring not a valid number >= lo!");
 			if(*endp) {
-				ASSERT(HERE, *endp == ':', "Non-colon separator in core-affinity-triplet substring!");
+				ASSERT(*endp == ':', "Non-colon separator in core-affinity-triplet substring!");
 				char_addr = endp+1;
 				incr = strtoul(char_addr, &endp, 10);
-				ASSERT(HERE, incr > 0, "incr-substring not a valid positive number!");
-				ASSERT(HERE, *endp == 0x0, "Non-numeric increment substring in core-affinity-triplet substring!");
+				ASSERT(incr > 0, "incr-substring not a valid positive number!");
+				ASSERT(*endp == 0x0, "Non-numeric increment substring in core-affinity-triplet substring!");
 			} else {
 				// If increment (third) argument of triplet omitted, default to incr = 1.
 			}
@@ -9241,7 +9226,7 @@ exit(0);
 				hwloc_obj_t obj_core, obj_pu;
 				obj_core = hwloc_get_obj_by_type(hw_topology, HWLOC_OBJ_CORE, i);
 				if (!obj_core) {
-					snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i);	ASSERT(HERE,0,cbuf);
+					snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i);	ASSERT(0,cbuf);
 				}
 				// 2. for each HWLOC_OBJ_CORE object in the above set, verify that it has at least (n) children
 				/*
@@ -9252,14 +9237,14 @@ exit(0);
 					'-cpu 0:11', or even more simply '-nthread 12') to use all 12 threads.
 				*/
 				if (obj_core->arity < incr) {
-					snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: Requested threads_per_core (%u) exceeds arity (%u) of HWLOC_OBJ_CORE[%u].\n",incr,obj_core->arity,i);	ASSERT(HERE,0,cbuf);
+					snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: Requested threads_per_core (%u) exceeds arity (%u) of HWLOC_OBJ_CORE[%u].\n",incr,obj_core->arity,i);	ASSERT(0,cbuf);
 				}
 				for (j = 0; j < incr; j++) {
 					obj_pu = obj_core->children[j];
 					// Set bit = (obj_pu->logical_index) in CORE_SET bitmap, used in thread-affinity setting:
 					bit = obj_pu->logical_index;
 					if(mi64_test_bit(CORE_SET, bit)) {
-						sprintf(cbuf, "HWLOC_OBJ_PU %d multiply specified in affinity-setting!",bit);	ASSERT(HERE, 0, cbuf);
+						sprintf(cbuf, "HWLOC_OBJ_PU %d multiply specified in affinity-setting!",bit);	ASSERT(0, cbuf);
 					} else {
 						mi64_set_bit(CORE_SET, bit, MAX_CORES>>6, 1);
 					#if INCLUDE_HWLOC==2
@@ -9273,8 +9258,8 @@ exit(0);
 			// CPU set encoded by integer-triplet argument corresponds to values of integer loop
 			// index i in the C-loop for(i = lo; i < hi; i += incr), excluding loop-exit value of i:
 			for(i = lo; i <= hi; i += incr, ncpu++) {
-				word = i>>6; bit = i & 63;	ASSERT(HERE, word < MAX_CORES, "Bitmap word exceeds MAX_CORES!");
-				if(CORE_SET[word] & (1ull<<bit)) { sprintf(cbuf, "Core %d multiply specified in affinity-setting!",i);	ASSERT(HERE, 0, cbuf); }
+				word = i>>6; bit = i & 63;	ASSERT(word < MAX_CORES, "Bitmap word exceeds MAX_CORES!");
+				if(CORE_SET[word] & (1ull<<bit)) { sprintf(cbuf, "Core %d multiply specified in affinity-setting!",i);	ASSERT(0, cbuf); }
 				else { CORE_SET[word] |= 1ull<<bit; }
 			}
 		}
@@ -9287,10 +9272,10 @@ exit(0);
 	{
 		uint32 ncpu = 0, i,bit,word,nc, core_count_oflow = 0;
 		char *char_addr = istr, *cptr;
-		ASSERT(HERE, char_addr != 0x0, "Null input-string pointer!");
+		ASSERT(char_addr != 0x0, "Null input-string pointer!");
 		size_t len = strlen(istr);	// length, not counting the \0 string terminator
-		ASSERT(HERE, len > 0, "Zero input-string length!");
-		ASSERT(HERE, len <= STR_MAX_LEN, "Excessive input-string length!");
+		ASSERT(len > 0, "Zero input-string length!");
+		ASSERT(len <= STR_MAX_LEN, "Excessive input-string length!");
 		// Clear existing core-affinity bitmap:
 		for(i = 0; i < MAX_CORES>>6; i++) { CORE_SET[i] = 0ull; }
 		// Affinity-triplet substrings are delimited by commas:
@@ -9310,7 +9295,7 @@ exit(0);
 			}
 		}
 		printf("\n");
-		ASSERT(HERE, nc == ncpu, "Bitmap #set-bits mismatches #cpu!");
+		ASSERT(nc == ncpu, "Bitmap #set-bits mismatches #cpu!");
 		NTHREADS = ncpu;
 		if(NTHREADS > MAX_THREADS) {	// Test this first, since if true, it implies truth of the 'else' conditional
 		//	fprintf(stderr,"WARN: NTHREADS = %d exceeds number of logical cores = %d ... Affinities for core indices > %d will be set (mod %d).\n",NTHREADS,MAX_THREADS,MAX_THREADS,MAX_THREADS);
@@ -9338,7 +9323,7 @@ double get_time(double tdiff)
 
 char*get_time_str(double tdiff)
 {
-	static char cbuf[STR_MAX_LEN];
+	static char cbuf[STR_MAX_LEN*2];
 #ifndef MULTITHREAD	// In || mode the mod_square routines use getRealTime() to accumulate wall-clock time, thus CLOCKS_PER_SEC not needed
 	tdiff /= CLOCKS_PER_SEC;	/* NB: CLOCKS_PER_SEC may be a phony value used to scale clock() ranges */
 #endif
@@ -9376,7 +9361,7 @@ char *MLUCAS_PATH = "";
 
    On sucess, set_mlucas_path() returns silently
    On error, set_mlucas_path() prints the cause of error to stderr
-   and calls ASSERT(HERE, 0, "Exiting.");
+   and calls ASSERT(0, "Exiting.");
 
    possible errors:
    unable to allocate buffer
@@ -9464,7 +9449,7 @@ void set_mlucas_path(void)
 	free(mlucas_path);
 	out_err_check:
 	if (has_err)
-		ASSERT(HERE, 0, "Exiting.");
+		ASSERT(0, "Exiting.");
 }
 
 /* Double-quote all spaces in the string pointed by src and write it to dest.
@@ -9530,7 +9515,7 @@ int mkdir_p(char *path)
 	fp = popen(cmdstr, "r");
 	if (fp == NULL) {
 		fprintf(stderr, "ERROR: unable to open pipe fp in mkdir_p()\n");
-		ASSERT(HERE, 0, "Exiting.");
+		ASSERT(0, "Exiting.");
 	}
 	fgets(tmp, STR_MAX_LEN + 1, fp);
 	pclose(fp);
@@ -9601,7 +9586,7 @@ FILE *mlucas_fopen(const char *path, const char *mode)
 */
 void mlucas_fprint(char*const cstr, uint32 echo_to_stderr)
 {
-	ASSERT(HERE, cstr != 0x0 && strlen(cstr) > 0,"Null string-pointer or empty string supplied to mlucas_fprint!");
+	ASSERT(cstr != 0x0 && strlen(cstr) > 0,"Null string-pointer or empty string supplied to mlucas_fprint!");
 	if(echo_to_stderr)
 		fprintf(stderr,"%s",cstr);
 	if(echo_to_stderr < 2) {
@@ -9621,7 +9606,7 @@ double mlucas_getOptVal(const char*fname, char*optname)
 {
 	const char func[] = "mlucas_getOptVal";
 	char cstr[STR_MAX_LEN], *cptr,*cadd;
-	ASSERT(HERE, fname != 0x0 && strlen(fname) > 0,"Null filename-pointer or empty string supplied to mlucas_getOptVal!");
+	ASSERT(fname != 0x0 && strlen(fname) > 0,"Null filename-pointer or empty string supplied to mlucas_getOptVal!");
 	FILE *fptr = mlucas_fopen(fname,"r");
 	double result = strtod("NaN", 0x0);
 	if(fptr) {
diff --git a/src/util.h b/src/util.h
index 5b4c8b68..620070ab 100755
--- a/src/util.h
+++ b/src/util.h
@@ -107,7 +107,7 @@ for(i = 0; i < 256; i++) {
 	for(j = bit ; j < 8; j++) {
 		x32 += 0xf<<(4*j);
 	}
-	printf("0x%8X,",x32);
+	printf("%#8X,",x32);
 }
 printf("\n");
 */
@@ -229,8 +229,12 @@ void	WARN	(long line, char*file, char*warn_string, char*warn_file, int copy2stde
 	__device__
 	void	ASSERT(long line, char*file, int expr, char*assert_string);
 #else
-	void	ASSERT	(long line, char*file, int expr, char*assert_string);
+	// void	ASSERT	(long line, char*file, int expr, char*assert_string);
+	void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string);
 #endif
+
+#define ASSERT(expr, assert_string) _ASSERT(#expr, __FILE__, __LINE__, __func__, (expr), assert_string)
+
 void	VAR_WARN(char *typelist, ...);
 
 void	byte_bitstr(const uint8  byte, char*ostr);