diff --git a/src/Mdata.h b/src/Mdata.h index 7c46ef3b..19e913e7 100755 --- a/src/Mdata.h +++ b/src/Mdata.h @@ -222,7 +222,7 @@ extern char PSTRING[STR_MAX_LEN]; /* Number being tested in string form, typical #endif extern const int hex_chars[16]; -extern char cbuf[STR_MAX_LEN], cstr[STR_MAX_LEN]; +extern char cbuf[STR_MAX_LEN*2], cstr[STR_MAX_LEN]; extern char in_line[STR_MAX_LEN]; extern char *char_addr; extern int char_offset; diff --git a/src/Mlucas.c b/src/Mlucas.c index ea0746c0..2969ef33 100644 --- a/src/Mlucas.c +++ b/src/Mlucas.c @@ -111,7 +111,7 @@ char PSTRING[STR_MAX_LEN]; // Modulus being used in string form, e.g. "M11091692 #endif const int hex_chars[16] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'}; -char cbuf[STR_MAX_LEN],cstr[STR_MAX_LEN]; +char cbuf[STR_MAX_LEN*2],cstr[STR_MAX_LEN]; char in_line[STR_MAX_LEN]; char *char_addr; @@ -481,15 +481,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov #ifdef USE_OMP // OpenMP not currently supported (attempting to build with this #define enabled barfs in // preprocessing via #error in platform.h), this is merely placeholder for possible future use: - ASSERT(HERE, MAX_THREADS = omp_get_num_procs(), "Illegal #Cores value stored in MAX_THREADS"); + ASSERT(MAX_THREADS = omp_get_num_procs(), "Illegal #Cores value stored in MAX_THREADS"); #elif(defined(USE_PTHREAD)) - ASSERT(HERE, MAX_THREADS = get_num_cores(), "Illegal #Cores value stored in MAX_THREADS"); + ASSERT(MAX_THREADS = get_num_cores(), "Illegal #Cores value stored in MAX_THREADS"); #else #error Unrecognized multithreading model! #endif // MAX_THREADS based on number of processing cores will most often be a power of 2, but don't assume that. 
- ASSERT(HERE, MAX_THREADS > 0,"MAX_THREADS must be > 0"); - ASSERT(HERE, MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h ."); + ASSERT(MAX_THREADS > 0,"MAX_THREADS must be > 0"); + ASSERT(MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h ."); if(!NTHREADS) { NTHREADS = 1; @@ -499,7 +499,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov parseAffinityString(cbuf); } else if(NTHREADS > MAX_CORES) { sprintf(cbuf,"ERROR: NTHREADS = %d exceeds the MAX_CORES setting in Mdata.h = %d\n", NTHREADS, MAX_CORES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } else { // In timing-test mode, allow #threads > #cores if(NTHREADS > MAX_THREADS) { fprintf(stderr,"WARN: NTHREADS = %d exceeds number of cores = %d\n", NTHREADS, MAX_THREADS); @@ -524,7 +524,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov else ITERS_BETWEEN_CHECKPOINTS = 10000; } else if(check_interval < 1000) { - ASSERT(HERE,0,"User-set value of check_interval must >= 1000."); + ASSERT(0,"User-set value of check_interval must >= 1000."); } else ITERS_BETWEEN_CHECKPOINTS = check_interval; @@ -532,7 +532,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov i = ITERS_BETWEEN_GCHECKS; j = ITERS_BETWEEN_GCHECK_UPDATES; - ASSERT(HERE, i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)"); + ASSERT(i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)"); // v19: If PRP test, make sure Gerbicz-checkproduct interval divides checkpoint-writing one. 
// If not true, merely warn here because user may be doing LL/DC/p-1 and not PRP-tests: k = ITERS_BETWEEN_CHECKPOINTS; @@ -542,8 +542,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // Alloc bitwise multiply-by-base array, needed to support P-1 factoring and PRP testing: if(!BASE_MULTIPLIER_BITS) { j = ((ITERS_BETWEEN_CHECKPOINTS+63) >> 6) + 1; // Add 1 pad element in case compiler does not 64-bit align - BASE_MULTIPLIER_BITS = ALLOC_UINT64(BASE_MULTIPLIER_BITS, j); if(!BASE_MULTIPLIER_BITS){ sprintf(cbuf, "ERROR: unable to allocate BASE_MULTIPLIER_BITS array in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - BASE_MULTIPLIER_BITS = ALIGN_UINT64(BASE_MULTIPLIER_BITS); ASSERT(HERE, ((intptr_t)BASE_MULTIPLIER_BITS & 63) == 0x0,"BASE_MULTIPLIER_BITS[] not aligned on 64-byte boundary!"); + BASE_MULTIPLIER_BITS = ALLOC_UINT64(BASE_MULTIPLIER_BITS, j); if(!BASE_MULTIPLIER_BITS){ sprintf(cbuf, "ERROR: unable to allocate BASE_MULTIPLIER_BITS array in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + BASE_MULTIPLIER_BITS = ALIGN_UINT64(BASE_MULTIPLIER_BITS); ASSERT(((intptr_t)BASE_MULTIPLIER_BITS & 63) == 0x0,"BASE_MULTIPLIER_BITS[] not aligned on 64-byte boundary!"); for(i = 0; i < j; i++) { BASE_MULTIPLIER_BITS[i] = 0ull; } // v20: Init = 0 here, in case we jump directly into p-1 stage 2 on restart } @@ -648,17 +648,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov char_addr += 3; // Check [k,b,n,c] portion of in_line: cptr = check_kbnc(char_addr, &p); - ASSERT(HERE, cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!"); + ASSERT(cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!"); // Next 2 entries in in_line are how-far-factored and "# of PRP tests that will be saved if P-1 is done and finds a factor": TF_BITS = 0xffffffff; tests_saved = 0.0; if((char_addr = strstr(cptr, ",")) != 0x0) { cptr++; // Only check if there's an appropriate 
TF_BITS entry in the input line TF_BITS = strtoul(++char_addr, &endp, 10); - ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found after TF_BITS field in assignment-specifying line!"); cptr++; + ASSERT((char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found after TF_BITS field in assignment-specifying line!"); cptr++; tests_saved = strtod(++char_addr, &endp); if(tests_saved < 0 || tests_saved > 2) { - sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved); ASSERT(HERE,0,cbuf); + sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved); ASSERT(0,cbuf); } // char_addr now points to leftmost char of tests_saved field, which we will overwrite with 0; // endp points to to-be-appended leftover portion @@ -671,14 +671,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // Create p-1 assignment, then edit original assignment line appropriately TEST_TYPE = TEST_TYPE_PM1; kblocks = get_default_fft_length(p); - ASSERT(HERE, pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!"); + ASSERT(pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!"); // Format the p-1 assignment into cbuf - use cptr here, as need to preserve value of char_addr: - cptr = strstr(in_line, "="); ASSERT(HERE,cptr != 0x0,"Malformed assignment!"); + cptr = strstr(in_line, "="); ASSERT(cptr != 0x0,"Malformed assignment!"); cptr++; while(isspace(*cptr)) { ++cptr; } // Skip any whitespace following the equals sign if(is_hex_string(cptr, 32)) { - strncpy(aid,cptr,32); sprintf(cbuf,"Pminus1=%s,1,2,%llu,-1,%u,%llu\n",aid,p,B1,B2); // If we get here, it's a M(p), not F(m) + strncpy(aid,cptr,32); sprintf(cbuf,"Pminus1=%s,1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",aid,p,B1,B2); // If we get here, it's a M(p), not F(m) } else - sprintf(cbuf,"Pminus1=1,2,%llu,-1,%u,%llu\n",p,B1,B2); + 
sprintf(cbuf,"Pminus1=1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",p,B1,B2); // Copy up to the final (tests_saved) char of the assignment into cstr and append tests_saved = 0; // A properly formatted tests_saved field is 1 char wide and begins at the current value of char_addr: i = char_addr - in_line; strncpy(cstr,in_line, i); cstr[i] = '0'; cstr[i+1] = '\0'; @@ -695,17 +695,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov TEST_TYPE = TEST_TYPE_PRP; } else { // PRP double-check: // NB: Hit a gcc compiler bug (which left i = 0 for e.g. char_addr = ", 3 ,...") using -O0 here ... clang compiled correctly, as did gcc -O1: - i = (int)strtol(char_addr+1, &cptr, 10); // PRP bases other than 3 allowed; see https://github.com/primesearch/Mlucas/issues/18 // ASSERT(HERE, i == 3,"PRP-test base must be 3!"); + i = (int)strtol(char_addr+1, &cptr, 10); // PRP bases other than 3 allowed; see https://github.com/primesearch/Mlucas/issues/18 // ASSERT(i == 3,"PRP-test base must be 3!"); PRP_BASE = i; - ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found in assignment-specifying line!"); - i = (int)strtol(char_addr+1, &cptr, 10); ASSERT(HERE, i == 1 || i == 5,"Only PRP-tests of type 1 (PRP-only) and type 5 (PRP and subsequent cofactor-PRP check) supported!"); + ASSERT((char_addr = strstr(cptr, ",")) != 0x0,"Expected ',' not found in assignment-specifying line!"); + i = (int)strtol(char_addr+1, &cptr, 10); ASSERT(i == 1 || i == 5,"Only PRP-tests of type 1 (PRP-only) and type 5 (PRP and subsequent cofactor-PRP check) supported!"); // Read in known prime-factors, if any supplied - resulting factors end up in KNOWN_FACTORS[]: if(*cptr == ',') //vv--- Pass in unused file-ptr fq here in case function emits any messages: nfac = extract_known_factors(p,cptr+1); // Use 0-or-not-ness of KNOWN_FACTORS[0] to differentiate between PRP-only and PRP-CF: if(KNOWN_FACTORS[0] != 0ull) { - ASSERT(HERE, i == 5,"Only PRP-CF tests of type 5 
supported!"); - if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) ASSERT(HERE, PRP_BASE == 3, "PRP-CF test base for Fermat numbers must be 3!"); + ASSERT(i == 5,"Only PRP-CF tests of type 5 supported!"); + if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) ASSERT(PRP_BASE == 3, "PRP-CF test base for Fermat numbers must be 3!"); } } goto GET_EXPO; @@ -715,7 +715,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov char_addr += 6; /* Look for comma following the modulus keyword and position next-keyword search right after it: */ if(!STREQN(char_addr,",",1)) - ASSERT(HERE, 0,"Expected ',' not found in input following modulus type specifier!"); + ASSERT(0,"Expected ',' not found in input following modulus type specifier!"); else char_addr++; @@ -729,7 +729,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov char_addr += 8; /* Look for comma following the modulus keyword and position next-keyword search right after it: */ if(!STREQN(char_addr,",",1)) - ASSERT(HERE, 0,"Expected ',' not found in input following modulus type specifier!"); + ASSERT(0,"Expected ',' not found in input following modulus type specifier!"); else char_addr++; } @@ -771,10 +771,10 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov char_addr += 7; // Check [k,b,n,c] portion of in_line: cptr = check_kbnc(char_addr, &p); - ASSERT(HERE, cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!"); - ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); + ASSERT(cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!"); + ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); B1 = (uint32)strtoul (char_addr+1, &cptr, 10); - ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); + ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' 
not found in assignment-specifying line!"); /* The C11 standard re. strtoull: "On success the function returns the converted integer as unsigned long long int type and sets endPtr to point to the first character after the input number. On failure it returns 0 and sets endPtr to point to NULL. It handles integer overflows efficiently and return ULONG_LONG_MAX on overflow." @@ -782,12 +782,12 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov also set endPtr to point to the first character after the input, which leaves some ambiguity - what if the input was in fact == ULONG_LONG_MAX? We assume here that nobody will use a p-1 stage bound so large: */ - B2 = (uint64)strtoull(char_addr+1, &cptr, 10); ASSERT(HERE, B2 != -1ull, "strtoull() overflow detected."); + B2 = (uint64)strtoull(char_addr+1, &cptr, 10); ASSERT(B2 != -1ull, "strtoull() overflow detected."); // Remaining args optional, with the 2 numerics presumed in-order, e.g. we only look for ',B2_start' field if ',TF_BITS' was present: if((char_addr = strstr(cptr, ",")) != 0x0) { - TF_BITS = (int)strtoul(char_addr+1, &cptr, 10); ASSERT(HERE, TF_BITS < 100 ,"TF_BITS value read from assignment is out of range."); + TF_BITS = (int)strtoul(char_addr+1, &cptr, 10); ASSERT(TF_BITS < 100 ,"TF_BITS value read from assignment is out of range."); if((char_addr = strstr(cptr, ",")) != 0x0) { - B2_start = (uint64)strtoull(char_addr+1, &cptr, 10); ASSERT(HERE, B2_start != -1ull, "strtoull() overflow detected."); + B2_start = (uint64)strtoull(char_addr+1, &cptr, 10); ASSERT(B2_start != -1ull, "strtoull() overflow detected."); if(B2_start > B1) // It's a stage 2 continuation run s2_continuation = TRUE; // Read in known prime-factors, if any supplied - resulting factors end up in KNOWN_FACTORS[]: @@ -804,15 +804,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov char_addr += 7; // Check [k,b,n,c] portion of in_line: cptr = check_kbnc(char_addr, &p); - 
ASSERT(HERE, cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!"); - ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); + ASSERT(cptr != 0x0, "[k,b,n,c] portion of in_line fails to parse correctly!"); + ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); TF_BITS = (int)strtoul(char_addr+1, &cptr, 10); - ASSERT(HERE, (char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); + ASSERT((char_addr = strstr(cptr, ",")) != 0x0 ,"Expected ',' not found in assignment-specifying line!"); tests_saved = strtod(++char_addr, &endp); if(tests_saved < 0 || tests_saved > 2) { - sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved); ASSERT(HERE,0,cbuf); + sprintf(cbuf, "ERROR: the specified tests_saved field [%10.5f] should be in the range [0,2]!\n",tests_saved); ASSERT(0,cbuf); } - ASSERT(HERE, pm1_set_bounds(p, get_default_fft_length(p)<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!"); + ASSERT(pm1_set_bounds(p, get_default_fft_length(p)<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!"); } #if INCLUDE_ECM else if(strstr(char_addr, "ECM")) @@ -823,13 +823,13 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov #endif else { - snprintf_nowarn(cbuf,STR_MAX_LEN,"WARN: Unrecognized/Unsupported option or empty assignment line. The ini file entry was %s\n",in_line); + snprintf(cbuf,STR_MAX_LEN*2,"WARN: Unrecognized/Unsupported option or empty assignment line. 
The ini file entry was %s\n",in_line); fprintf(stderr,"%s",cbuf); goto read_next_assignment; } if(!p) { // For legacy assignment types, set p here - ASSERT(HERE, (char_addr = strstr(char_addr, "=")) != 0x0,"Expected '=' not found in assignment-specifying line!"); + ASSERT((char_addr = strstr(char_addr, "=")) != 0x0,"Expected '=' not found in assignment-specifying line!"); char_addr++; /* Skip any whitespace following the equals sign:*/ while(isspace(*char_addr)) { ++char_addr; } @@ -839,19 +839,19 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov else if(STREQN_NOCASE(char_addr,"n/a",3)) char_addr = strstr(char_addr, ",") + 1; - p = strtoull(char_addr, &cptr, 10); ASSERT(HERE, p != -1ull, "strtoull() overflow detected."); + p = strtoull(char_addr, &cptr, 10); ASSERT(p != -1ull, "strtoull() overflow detected."); } GET_EXPO: // Need to init this for savefile-naming code - ASSERT(HERE, p != 0ull, "Exponent has not been set!"); - sprintf(ESTRING,"%llu",p); + ASSERT(p != 0ull, "Exponent has not been set!"); + sprintf(ESTRING,"%" PRIu64,p); // In PRP-test case, have already read the exponent from the worktodo line /* Special case of user forcing a non-default FFT length for an exponent in the worktodo file: */ if(exponent && (p != exponent)) { // || (MODULUS_TYPE != MODULUS_TYPE_MERSENNE)) 15. Oct 2012: Need same flexibility for Fermat numbers (e.g. F27 @ 7168k) as for Mersennes, so disable modulus-type part of conditional sprintf(cbuf,"User-supplied exponent and FFT-length for full-length test requires an exponent-matching 'Test=' or 'DoubleCheck=' %s entry!",WORKFILE); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* Check #bits in the Mersenne exponent vs. 
the allowed maximum: */ @@ -864,14 +864,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(findex <= MAX_PRIMALITY_TEST_BITS) p = (uint64)1 << findex; else - ASSERT(HERE, 0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS"); + ASSERT(0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS"); // For purposes of the bits-in-p limit, treat 2^findex as having (findex) rather than (findex+1) bits: nbits_in_p = findex; } else - ASSERT(HERE, 0,"MODULUS_TYPE unknown!"); + ASSERT(0,"MODULUS_TYPE unknown!"); - ASSERT(HERE, nbits_in_p <= MAX_EXPO_BITS,"Require nbits_in_p <= MAX_EXPO_BITS"); + ASSERT(nbits_in_p <= MAX_EXPO_BITS,"Require nbits_in_p <= MAX_EXPO_BITS"); #if INCLUDE_TF @@ -889,7 +889,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov /* For now, always start at k = 1: */ log2_min_factor = 0.0; log2_max_factor = get_default_factoring_depth(p); - ASSERT(HERE, log2_max_factor <= MAX_FACT_BITS, "log2_max_factor > MAX_FACT_BITS!"); + ASSERT(log2_max_factor <= MAX_FACT_BITS, "log2_max_factor > MAX_FACT_BITS!"); /* Field following the exponent is the already-factored-to depth: if none found, use defaults. */ char_addr = strstr(char_addr, ","); @@ -951,7 +951,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov TF_BITS = strtoul(char_addr, &endp, 10); #if INCLUDE_TF if(TF_BITS > MAX_FACT_BITS) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: TF_BITS of %u > max. allowed of %u. The ini file entry was %s\n", TF_BITS, MAX_FACT_BITS, in_line); + snprintf(cbuf,STR_MAX_LEN*2,"ERROR: TF_BITS of %u > max. allowed of %u. 
The ini file entry was %s\n", TF_BITS, MAX_FACT_BITS, in_line); fprintf(stderr,"%s",cbuf); goto GET_NEXT_ASSIGNMENT; } @@ -976,7 +976,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov pm1_done = strtoul(char_addr, &endp, 10); if(pm1_done > 1) { sprintf(cbuf, "ERROR: the specified pm1_done field [%u] should be 0 or 1!\n",pm1_done); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } if(!pm1_done) { // pm1_done == TRUE is a no-op, translating to "proceed with primality test" // Don't actually use this in pm1_set_bounds(), due to the rise of the single-shot PRP-with-proof paradigm, but for form's sake: @@ -984,14 +984,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // Create p-1 assignment, then edit original assignment line appropriately TEST_TYPE = TEST_TYPE_PM1; kblocks = get_default_fft_length(p); - ASSERT(HERE, pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!"); + ASSERT(pm1_set_bounds(p, kblocks<<10, TF_BITS, tests_saved), "Failed to set p-1 bounds!"); // Format the p-1 assignment into cbuf: - char_addr = strstr(in_line, "="); ASSERT(HERE,char_addr != 0x0,"Malformed assignment!"); + char_addr = strstr(in_line, "="); ASSERT(char_addr != 0x0,"Malformed assignment!"); char_addr++; while(isspace(*char_addr)) { ++char_addr; } // Skip any whitespace following the equals sign if(is_hex_string(char_addr, 32)) { - strncpy(aid,char_addr,32); sprintf(cbuf,"Pminus1=%s,1,2,%llu,-1,%u,%llu\n",aid,p,B1,B2); // If we get here, it's a M(p), not F(m) + strncpy(aid,char_addr,32); sprintf(cbuf,"Pminus1=%s,1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",aid,p,B1,B2); // If we get here, it's a M(p), not F(m) } else - sprintf(cbuf,"Pminus1=1,2,%llu,-1,%u,%llu\n",p,B1,B2); + sprintf(cbuf,"Pminus1=1,2,%" PRIu64 ",-1,%u,%" PRIu64 "\n",p,B1,B2); // Copy all but the final (pm1_done) char of the assignment into cstr and append pm1_done = 1. 
If in_line ends with newline, first --j: j = strlen(in_line) - 1; j -= (in_line[j] == '\n'); @@ -1012,7 +1012,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov else if(exponent != 0) /* elseif((found WORKFILE) == FALSE) */ { p = exponent; - fprintf(stderr," %s file not found...using user-supplied command-line exponent p = %llu\n",WORKFILE,p); + fprintf(stderr," %s file not found...using user-supplied command-line exponent p = %" PRIu64 "\n",WORKFILE,p); /* This takes care of the number-to-char conversion and leading-whitespace-removal in one step - use PSTRING for temporary storage here: */ strcpy(ESTRING, &PSTRING[convert_uint64_base10_char(PSTRING, p)]); @@ -1023,7 +1023,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(findex <= MAX_PRIMALITY_TEST_BITS) p = (uint64)1 << findex; else - ASSERT(HERE, 0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS"); + ASSERT(0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS"); /* For purposes of the bits-in-p limit, treat 2^findex as having (findex) rather than (findex+1) bits: */ @@ -1033,15 +1033,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov INTERACT=TRUE; - ASSERT(HERE,TEST_TYPE,"TEST_TYPE not set!"); - ASSERT(HERE,TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!"); + ASSERT(TEST_TYPE,"TEST_TYPE not set!"); + ASSERT(TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!"); /* If nbits_in_p > MAX_PRIMALITY_TEST_BITS, it better be a TF run: */ if(TEST_TYPE == TEST_TYPE_TF) { #if INCLUDE_TF /* Currently TF only supported for Mersennes: */ - ASSERT(HERE, (MODULUS_TYPE == MODULUS_TYPE_MERSENNE), "Trial-factoring Currently only supported for Mersenne numbers"); + ASSERT((MODULUS_TYPE == MODULUS_TYPE_MERSENNE), "Trial-factoring Currently only supported for Mersenne numbers"); /* For now, always start at k = 1: */ log2_min_factor = 0.0; if(iterations) { @@ -1051,15 +1051,15 @@ with the default #threads = 1 and affinity set to logical 
core 0, unless user ov else log2_max_factor = get_default_factoring_depth(p); - ASSERT(HERE, log2_max_factor >= 0, "log2_max_factor must be positive!"); - ASSERT(HERE, log2_max_factor <= MAX_FACT_BITS, "log2_max_factor exceeds MAX_FACT_BITS!"); + ASSERT(log2_max_factor >= 0, "log2_max_factor must be positive!"); + ASSERT(log2_max_factor <= MAX_FACT_BITS, "log2_max_factor exceeds MAX_FACT_BITS!"); #else - ASSERT(HERE, 0, "Trial-factoring not supported for this build/platform."); + ASSERT(0, "Trial-factoring not supported for this build/platform."); #endif } else if(TEST_TYPE == TEST_TYPE_PM1) /* P-1 factoring attempt */ { - ASSERT(HERE, nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring."); + ASSERT(nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring."); pm1_check_bounds(); // Proper setting of timing_test_iters in this case needs us to compute the stage 1 prime-powers product: // Compute stage 1 prime-powers product, store in PM1_S1_PRODUCT and store #bits of same in PM1_S1_PROD_BITS: @@ -1075,8 +1075,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov else /* Primality or PRP test */ { /* fprintf(stderr, "P = %u, nbits_in_p = %d\n",p,nbits_in_p); */ - ASSERT(HERE, nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring."); - ASSERT(HERE,iterations != 0,"Timing test with User-supplied exponent requires number of iterations to be specified via the -iters flag!"); + ASSERT(nbits_in_p <= MAX_PRIMALITY_TEST_BITS, "Inputs this large only permitted for trial-factoring."); + ASSERT(iterations != 0,"Timing test with User-supplied exponent requires number of iterations to be specified via the -iters flag!"); if(iterations <= 0) { fprintf(stderr, " Specified %u self-test iterations : must be > 0.\n", iterations); return ERR_TESTITERS_OUTOFRANGE; @@ -1089,12 +1089,12 @@ with the default #threads = 1 and affinity set to 
logical core 0, unless user ov } else { fprintf(stderr,"No %s file not found, nor user-supplied command-line exponent.\n",WORKFILE); print_help(); - ASSERT(HERE, 0, "Unsupported combination of command-line args. Note that if you are trying to\nrun a single-FFT-length self-test, you *must* explicitly specify the iteration\ncount, e.g. './Mlucas -fft 7168 <-iters [+int]> [-cpu ]'"); + ASSERT(0, "Unsupported combination of command-line args. Note that if you are trying to\nrun a single-FFT-length self-test, you *must* explicitly specify the iteration\ncount, e.g. './Mlucas -fft 7168 <-iters [+int]> [-cpu ]'"); } // endif(found WORKFILE?) // If production run (not self-test), echo assignment to per-exponent logfile: if(!INTERACT) { - snprintf_nowarn(cbuf,STR_MAX_LEN," %s entry: %s\n",WORKFILE,in_line); + snprintf(cbuf,STR_MAX_LEN*2," %s entry: %s\n",WORKFILE,in_line); mlucas_fprint(cbuf,0); } @@ -1112,8 +1112,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov strcpy(STATFILE, RESTARTFILE); strcat(STATFILE, ".stat"); /*fprintf(stderr, "STATFILE = %s\n",STATFILE); */ - ASSERT(HERE,TEST_TYPE,"TEST_TYPE not set!"); - ASSERT(HERE,TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!"); + ASSERT(TEST_TYPE,"TEST_TYPE not set!"); + ASSERT(TEST_TYPE <= TEST_TYPE_MAX,"TEST_TYPE out of range!"); /* Fom this point onward the first character of restart filenames is context-dependent: */ #if INCLUDE_TF @@ -1124,7 +1124,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(!ASSIGNMENT_TYPE_MATRIX[MODULUS_TYPE][TEST_TYPE_TF]) { sprintf(cbuf, "TEST_TYPE_TF with MODULUS_TYPE = %u not supported!\n", MODULUS_TYPE); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } factor(ESTRING, log2_min_factor, log2_max_factor); @@ -1133,17 +1133,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov #endif if(TEST_TYPE > TEST_TYPE_MAX) { - ASSERT(HERE, 0,"ERROR: Unrecognized assignment type in savefile 
processing.\n"); + ASSERT(0,"ERROR: Unrecognized assignment type in savefile processing.\n"); } /* endif(TEST_TYPE == ...) */ /********************* P-1, primality, or PRP Test: ***********************************************/ if(p < PMIN) { - fprintf(stderr, " p must be at least %llu.\n",PMIN); + fprintf(stderr, " p must be at least %" PRIu64 ".\n",PMIN); return ERR_EXPONENT_ILLEGAL; } else if(p > PMAX) { - fprintf(stderr, " p must be no greater than %llu.\n",PMAX); + fprintf(stderr, " p must be no greater than %" PRIu64 ".\n",PMAX); return ERR_EXPONENT_ILLEGAL; } @@ -1173,7 +1173,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } TRANSFORM_TYPE = REAL_WRAPPER; - snprintf_nowarn(PSTRING,STR_MAX_LEN, "M%s", ESTRING); + snprintf(PSTRING,STR_MAX_LEN, "M%s", ESTRING); /* v19: Unlike standard mod-M(p) Fermat-PRP test, x0^(N-1) ?== 1 (mod N) which for N = M(p) gives N-1 = 2^p-2 = 0b111[p-1 binary 1s]1110 and thus requires [p-2 (x := x^2*base) steps followed by 1 final squaring], the @@ -1192,22 +1192,22 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov pm1_check_bounds(); s1p_alloc = compute_pm1_s1_product(p); maxiter = PM1_S1_PROD_BITS; // NOTE: In this case we don't want to override the PRP_BASE = 3 value set in compute_pm1_s1_product() - ASSERT(HERE, B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!"); + ASSERT(B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!"); RES_SHIFT = 0ull; // Must set = 0 here to make sure BASE_MULTIPLIER_BITS array gets set = 0 below } else - ASSERT(HERE,0,"Unsupported test type! (Neither LL,PRP nor P-1)"); + ASSERT(0,"Unsupported test type! 
(Neither LL,PRP nor P-1)"); } else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { #ifdef USE_ARM_V8_SIMD - ASSERT(HERE, 0, "ARMv8 SIMD builds do not support Fermat-number testing!"); + ASSERT(0, "ARMv8 SIMD builds do not support Fermat-number testing!"); #endif - ASSERT(HERE,findex >= 13 && findex < 64, "Fermat number index must be in range [13,63]!\n"); + ASSERT(findex >= 13 && findex < 64, "Fermat number index must be in range [13,63]!\n"); // This takes care of the number-to-char conversion and leading-whitespace-removal // in one step - use PSTRING for temporary storage here: strcpy(ESTRING, &PSTRING[convert_uint64_base10_char(PSTRING, (uint64)findex)]); - ASSERT(HERE, (p >> findex) == 1,"Require (p >> findex) == 1"); - sprintf(BIN_EXP,"%llu",p); // May need this for workfile postprocessing if assignment is in KBNC format + ASSERT((p >> findex) == 1,"Require (p >> findex) == 1"); + sprintf(BIN_EXP,"%" PRIu64,p); // May need this for workfile postprocessing if assignment is in KBNC format TRANSFORM_TYPE = RIGHT_ANGLE; sprintf(PSTRING, "F%u", findex); if(TEST_TYPE == TEST_TYPE_PRIMALITY) { @@ -1215,17 +1215,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov PRP_BASE = 2; // v20: Pépin test doesn't use this as the initial seed (that defaults to 3), but rather for the random-shift // offsets used to prevent the shift count from modding to 0 as a result of repeated doublings (mod 2^m) } else if(TEST_TYPE == TEST_TYPE_PRP) { - ASSERT(HERE, KNOWN_FACTORS[0] != 0, "Fermat-mod PRP test implies a PRP-CF run, but no known-factors provided!"); + ASSERT(KNOWN_FACTORS[0] != 0, "Fermat-mod PRP test implies a PRP-CF run, but no known-factors provided!"); RES_SHIFT = 0ull; // Must set = 0 here to make sure BASE_MULTIPLIER_BITS array gets set = 0 below } else if(TEST_TYPE == TEST_TYPE_PM1) { // Compute stage 1 prime-powers product, store in PM1_S1_PRODUCT, store #bits of same in PM1_S1_PROD_BITS: pm1_check_bounds(); s1p_alloc = 
compute_pm1_s1_product(p); maxiter = PM1_S1_PROD_BITS; // NOTE: In this case we don't want to override the PRP_BASE = 3 value set in compute_pm1_s1_product() - ASSERT(HERE, B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!"); + ASSERT(B1 > 0 && maxiter > B1, "P-1 b1 and/or maxiter unset!"); RES_SHIFT = 0ull; // Must set = 0 here to make sure BASE_MULTIPLIER_BITS array gets set = 0 below } else - ASSERT(HERE,0,"Unsupported test type! (Neither Pepin-primality nor P-1)"); + ASSERT(0,"Unsupported test type! (Neither Pepin-primality nor P-1)"); j = ((ITERS_BETWEEN_CHECKPOINTS+63) >> 6); if(RES_SHIFT == 0ull) { @@ -1244,14 +1244,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } } else { - ASSERT(HERE, 0,"Unknown Self-Test Modulus Type!"); + ASSERT(0,"Unknown Self-Test Modulus Type!"); } /* endif(MODULUS_TYPE) */ // mi64_shlc currently limited to 32-bit shift counts - for technical reasons described in comments at top of that function, // the largest exponent testable-with-shift must satisfy condition below, which yields largest M(p) with p = 4294967231 = 2^32-65: if(RES_SHIFT && (p+63) > 0xFFFFFFFFull) { sprintf(cbuf,"ERROR: Exponents this large do not support residue shift! Please run with '-shift 0'.\n"); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } /* In production-run (INTERACT = False) mode, allow command-line-forced FFT lengths which are at most "one size too large" relative to the default length for the exponent in question. 
Supported lengths @@ -1261,7 +1261,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov kblocks = get_default_fft_length(p); if(!fft_length || (!INTERACT && MODULUS_TYPE == MODULUS_TYPE_MERSENNE && 8*fft_length > 9*kblocks)) { if(!kblocks) { - fprintf(stderr,"ERROR detected in get_default_fft_length for p = %llu.\n",p); + fprintf(stderr,"ERROR detected in get_default_fft_length for p = %" PRIu64 ".\n",p); return ERR_FFTLENGTH_ILLEGAL; } } else { @@ -1319,8 +1319,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov /* Only allow lengths that are <= 2x default */ if( !(i >= kblocks && i <= (kblocks<<1) ) ) { - sprintf(cbuf,"Call to get_preferred_fft_radix returns out-of-range FFT length: asked for %u, returned %u, packed value= 0x%8X\n", kblocks, i, dum); - ASSERT(HERE, 0, cbuf); + sprintf(cbuf,"Call to get_preferred_fft_radix returns out-of-range FFT length: asked for %u, returned %u, packed value= %#8X\n", kblocks, i, dum); + ASSERT(0, cbuf); } else /* If length acceptable, extract the FFT-radix data encoded and populate the NRADICES and RADIX_VEC[] globals */ { @@ -1329,9 +1329,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov /* Make sure the FFT length is supported: */ if(get_fft_radices(kblocks, 0, 0x0, 0x0, 0) != 0) { - ASSERT(HERE, get_fft_radices(kblocks, 0, 0x0, 0x0, 0) == ERR_FFTLENGTH_ILLEGAL, "Unexpected return value for get_fft_radices()"); + ASSERT(get_fft_radices(kblocks, 0, 0x0, 0x0, 0) == ERR_FFTLENGTH_ILLEGAL, "Unexpected return value for get_fft_radices()"); sprintf(cbuf, "ERROR: length %d = %d K not available.\n",n,kblocks); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } } @@ -1348,7 +1348,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov print a warning if the p/pmax ratio > 1 to an acceptably small degree; error out if the ratio is unreasonably > 1: */ 
uint64 pmax_rec = given_N_get_maxP(n); double exp_ratio = (double)p/pmax_rec; - fprintf(stderr, "INFO: Maximum recommended exponent for FFT length (%u Kdbl) = %llu; p[ = %llu]/pmax_rec = %12.10f.\n",kblocks,pmax_rec,p,exp_ratio); + fprintf(stderr, "INFO: Maximum recommended exponent for FFT length (%u Kdbl) = %" PRIu64 "; p[ = %" PRIu64 "]/pmax_rec = %12.10f.\n",kblocks,pmax_rec,p,exp_ratio); // Set initial value of USE_SHORT_CY_CHAIN based on how close p/pmax is to 1.0, but only if current chain length is longer // (e.g. if ROE-retry logic has led to a shorter-than-default chain length, don't revert to default): if(exp_ratio > 0.99 && USE_SHORT_CY_CHAIN < 3) @@ -1369,7 +1369,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov fprintf(stderr, "INFO: specified FFT length %d K is less than recommended %d K for this p.\n",kblocks,i); else { sprintf(cbuf, "ERROR: specified FFT length %d K is much too small: Recommended length for this p = %d K ... quitting.\n",kblocks,i); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } } @@ -1383,14 +1383,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } /*...If array padding turned on, check that the blocklength divides the unpadded runlength... */ if((DAT_BITS < 31) && ((n >> DAT_BITS) << DAT_BITS) != n) - ASSERT(HERE, 0,"ERROR: blocklength does not divide runlength!"); + ASSERT(0,"ERROR: blocklength does not divide runlength!"); /*...Find padded array length... */ npad = n + ( (n >> DAT_BITS) << PAD_BITS ); /* length of padded data array. 
*/ /* If the residue and other modulus-size-dependent data arrays too small for the new assignment, deallocate them: */ if(nalloc > 0 && npad > nalloc) { - ASSERT(HERE, a_ptmp != 0x0 && a != 0x0 && b != 0x0 && c != 0x0 && d != 0x0,"Require (a_ptmp,a,b,c,d) != 0x0"); + ASSERT(a_ptmp != 0x0 && a != 0x0 && b != 0x0 && c != 0x0 && d != 0x0,"Require (a_ptmp,a,b,c,d) != 0x0"); free((void *)a_ptmp); a_ptmp = a = b = c = d = e = 0x0; b_uint64_ptr = c_uint64_ptr = d_uint64_ptr = e_uint64_ptr = 0x0; free((void *)arrtmp); arrtmp=0x0; free((void *)BIGWORD_BITMAP); BIGWORD_BITMAP = 0x0; @@ -1406,9 +1406,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov j = 0; if(npad & 7) j = 8 - (npad & 7); - nalloc = npad + j; ASSERT(HERE, (nalloc & 7) == 0,"nalloc must be a multiple of 8!"); // This is so b,c,d enjoy same 64-byte alignment as a[] + nalloc = npad + j; ASSERT((nalloc & 7) == 0,"nalloc must be a multiple of 8!"); // This is so b,c,d enjoy same 64-byte alignment as a[] nbytes = nalloc<<3; - ASSERT(HERE, a_ptmp == 0x0 && a == 0x0 && b == 0x0 && c == 0x0 && d == 0x0 && e == 0x0 && arrtmp == 0x0,"Require (a_ptmp,b,c,d,e,arrtmp) == 0x0"); + ASSERT(a_ptmp == 0x0 && a == 0x0 && b == 0x0 && c == 0x0 && d == 0x0 && e == 0x0 && arrtmp == 0x0,"Require (a_ptmp,b,c,d,e,arrtmp) == 0x0"); if(use_lowmem == 2) { // Handy for huge-FFT self-tests on low-mem systems sprintf(cbuf,"WARN: Low-memory[%u] run mode disallows PRP-testing|Gerbicz-check and p-1 stage 2.\n",use_lowmem); mlucas_fprint(cbuf,1); @@ -1416,11 +1416,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } else { j = 5; } - a_ptmp = ALLOC_DOUBLE(a_ptmp, j*nalloc); if(!a_ptmp){ sprintf(cbuf, "ERROR: unable to allocate array A in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + a_ptmp = ALLOC_DOUBLE(a_ptmp, j*nalloc); if(!a_ptmp){ sprintf(cbuf, "ERROR: unable to allocate array A in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } a = 
ALIGN_DOUBLE(a_ptmp); - ASSERT(HERE, ((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!"); + ASSERT(((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!"); if(((intptr_t)a & 127) != 0x0) - fprintf(stderr, "WARN: a[] = 0x%08lX not aligned on 128-byte boundary!\n", (intptr_t)a); + fprintf(stderr, "WARN: a[] = %#08" PRIXPTR " not aligned on 128-byte boundary!\n", (intptr_t)a); // v19: Add three more full-residue arrays to support 2-input FFT-modmul needed for Gerbicz check (and later, p-1 support): if(use_lowmem < 2) { b = a + nalloc; c = b + nalloc; d = c + nalloc, e = d + nalloc; @@ -1431,14 +1431,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // For multi-FFT-length self-tests, conservatively figure as many as 20 bits (2.5 bytes) per float-double residue word: // v20: for largest currently supported FFT of 512Mdoubles, i still -barely - fits in a uint32, but 2.5*i does not: arrtmp_alloc = i; arrtmp_alloc = MAX((p+63)>>2, (uint64)(arrtmp_alloc*2.5)) >> 3; // #limb needed to store p bits = (p+63)>>6, so alloc at least 2x this - arrtmp = ALLOC_UINT64(arrtmp, arrtmp_alloc);if(!arrtmp ){ sprintf(cbuf, "ERROR: unable to allocate array ARRTMP with %u bytes in main.\n",i); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + arrtmp = ALLOC_UINT64(arrtmp, arrtmp_alloc);if(!arrtmp ){ sprintf(cbuf, "ERROR: unable to allocate array ARRTMP with %u bytes in main.\n",i); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } // For an n-word main-array, BIGWORD_BITMAP and BIGWORD_NBITS have (n/64) elts each, thus need 1/64 + 1/32 the total // storage of the main-array. Use uint64 alloc-macro for both, so halve the num-elts arg for the BIGWORD_NBITS alloc. // As with above arrays, for multi-length self-test, alloc based on max. FFT length used (i) rather than current length (n). 
// Don't need any array padding on these bitmap arrays, but since nalloc includes padding, no harm in using it: - BIGWORD_BITMAP = ALLOC_UINT64(BIGWORD_BITMAP, nalloc>>6); if(!BIGWORD_BITMAP){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_BITMAP in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - BIGWORD_NBITS = (uint32 *)ALLOC_UINT64(BIGWORD_NBITS , nalloc>>7); if(!BIGWORD_NBITS ){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_NBITS in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + BIGWORD_BITMAP = ALLOC_UINT64(BIGWORD_BITMAP, nalloc>>6); if(!BIGWORD_BITMAP){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_BITMAP in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + BIGWORD_NBITS = (uint32 *)ALLOC_UINT64(BIGWORD_NBITS , nalloc>>7); if(!BIGWORD_NBITS ){ sprintf(cbuf, "ERROR: unable to allocate array BIGWORD_NBITS in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } // Multithreaded-code debug: Set address to watch: #ifdef MULTITHREAD @@ -1456,8 +1456,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov i = ITERS_BETWEEN_GCHECKS; j = ITERS_BETWEEN_GCHECK_UPDATES; k = ITERS_BETWEEN_CHECKPOINTS; - ASSERT(HERE, i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)"); - ASSERT(HERE, i%k == 0 && k%j == 0, "G-checkproduct update interval must divide savefile-update one, which must divide the G-check interval"); + ASSERT(i == j*j, "#iterations between Gerbicz-checksum updates must = sqrt(#iterations between residue-integrity checks)"); + ASSERT(i%k == 0 && k%j == 0, "G-checkproduct update interval must divide savefile-update one, which must divide the G-check interval"); } // PRP-test: Init bitwise multiply-by-base array - cf. comment re. 
modified Fermat-PRP needed by Gerbicz check @@ -1483,7 +1483,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(fp) { if(TEST_TYPE == TEST_TYPE_PRP) { dum = PRP_BASE; - ASSERT(HERE, use_lowmem < 2, "PRP-test mode not available in Low-memory[2] run mode!"); + ASSERT(use_lowmem < 2, "PRP-test mode not available in Low-memory[2] run mode!"); } i = read_ppm1_savefiles(cstr, p, &j, fp, &itmp64, (uint8*)arrtmp , &Res64,&Res35m1,&Res36m1, // Primality-test residue @@ -1495,17 +1495,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(strstr(cbuf, "read_ppm1_savefiles")) mlucas_fprint(cbuf,1); /* And now for the official spokesmessage: */ - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: read_ppm1_savefiles Failed on savefile %s!\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: read_ppm1_savefiles Failed on savefile %s!\n",cstr); mlucas_fprint(cbuf,1); if(ierr == ERR_GERBICZ_CHECK) { sprintf(cbuf,"Failed to correctly read last-good-Gerbicz-check data savefile!"); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } else if(cstr[0] != 'q') { cstr[0] = 'q'; goto READ_RESTART_FILE; } else { sprintf(cbuf,"Failed to correctly read both primary or secondary savefile!"); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } } // If user attempts to restart run with different PRP base than it was started with, ignore the new value and continue with the initial one: @@ -1546,33 +1546,33 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov */ if(ierr == ERR_GERBICZ_CHECK) { MOD_ADD64(RES_SHIFT,RES_SHIFT,p,RES_SHIFT); - snprintf_nowarn(cbuf,STR_MAX_LEN, "Gerbicz-check-error restart: Mod-doubling residue shift to avoid repeating any possible fractional-error aliasing in retry, new shift = %llu\n",RES_SHIFT); + snprintf(cbuf,STR_MAX_LEN*2, "Gerbicz-check-error restart: Mod-doubling residue shift to avoid repeating any 
possible fractional-error aliasing in retry, new shift = %" PRIu64 "\n",RES_SHIFT); mlucas_fprint(cbuf,1); } /* Allocate floating-point residue array and convert savefile bytewise residue to floating-point form, after first applying required circular shift read into the global RES_SHIFT during the above bytewise-savefile read. */ if(!convert_res_bytewise_FP((uint8*)arrtmp, a, n, p)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",cstr); mlucas_fprint(cbuf,0); if(cstr[0] != 'q' && !(ierr == ERR_GERBICZ_CHECK)) { // Secondary savefile only exists for regular checkpoint files cstr[0] = 'q'; goto READ_RESTART_FILE; } else { - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } } // v19: G-check residue - we only create savefile for PRP-phase of any PRP-CF run, i.e. always expect a G-check residue: if(DO_GCHECK) { if(!convert_res_bytewise_FP((uint8*)e_uint64_ptr, b, n, p)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on Gerbicz-check residue read from savefile %s!\n",cstr); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on Gerbicz-check residue read from savefile %s!\n",cstr); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } else { ierr = 0; s1 = sum64(b_uint64_ptr, n); s2 = s3 = s1; // Init triply-redundant checksum of G-checkproduct } } - ASSERT(HERE, ilo > 0,"Require ilo > 0!"); + ASSERT(ilo > 0,"Require ilo > 0!"); ihi = ilo+ITERS_BETWEEN_CHECKPOINTS; /* If for some reason last checkpoint was at a non-multiple of ITERS_BETWEEN_CHECKPOINTS, round down: */ ihi-= ihi%ITERS_BETWEEN_CHECKPOINTS; @@ -1584,11 +1584,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov { /* If we're on the primary restart file, set up for secondary: */ if(ierr == 
ERR_GERBICZ_CHECK || s2_continuation) { // Secondary savefile only exists for regular checkpoint files - snprintf_nowarn(cbuf,STR_MAX_LEN, "INFO: Needed restart file %s not found...moving on to next assignment in %s.\n",cstr,WORKFILE); + snprintf(cbuf,STR_MAX_LEN*2, "INFO: Needed restart file %s not found...moving on to next assignment in %s.\n",cstr,WORKFILE); mlucas_fprint(cbuf,1); goto GET_NEXT_ASSIGNMENT; } else if(cstr[0] != 'q') { - snprintf_nowarn(cbuf,STR_MAX_LEN, "INFO: primary restart file %s not found...looking for secondary...\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2, "INFO: primary restart file %s not found...looking for secondary...\n",cstr); mlucas_fprint(cbuf,1); cstr[0] = 'q'; goto READ_RESTART_FILE; @@ -1611,11 +1611,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov ihi = ITERS_BETWEEN_CHECKPOINTS; } - ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!"); - ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); + ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!"); + ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); /* If at the start of a p-1 or primality test, set the initial seed for the run: */ - ASSERT(HERE, TEST_TYPE <= TEST_TYPE_MAX,"Given TEST_TYPE not supported!"); + ASSERT(TEST_TYPE <= TEST_TYPE_MAX,"Given TEST_TYPE not supported!"); if(ilo == 0) { memset(a, 0, npad*sizeof(double)); @@ -1638,7 +1638,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // In theory could allow residue-shift during P-1, at least in stage 1, but in practice need the BASE_MULTIPLIER_BITS array // to hold the part of the stage 1 prime-powers product needed for the current iteration interval of the stage 1 powering: if(TEST_TYPE == TEST_TYPE_PM1) { - ASSERT(HERE, RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n"); + ASSERT(RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n"); RES_SHIFT = 0ull; a[0] = iseed; } else { // Apply initial-residue 
shift - if user has not set one via cmd-line or current value >= p, randomly choose a value in [0,p). @@ -1658,24 +1658,24 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } // Since residue is otherwise 0, use shifted-carryin function on double-precision padded-array residue: itmp64 = shift_word(a, n, p, RES_SHIFT, (double)iseed); // Note return value (specifically high 7 bytes thereof) is an unpadded index - ASSERT(HERE, (itmp64 >> 8) < n , "Return value of shift_word(): unpadded-array-index out of range!"); - ASSERT(HERE, (itmp64 & 255) < ceil((double)p/n), "Return value of shift_word(): bit-in-array-word value out of range!"); + ASSERT((itmp64 >> 8) < n , "Return value of shift_word(): unpadded-array-index out of range!"); + ASSERT((itmp64 & 255) < ceil((double)p/n), "Return value of shift_word(): bit-in-array-word value out of range!"); } } else if(DO_GCHECK) { if(MODULUS_TYPE == MODULUS_TYPE_FERMAT && TEST_TYPE == TEST_TYPE_PRIMALITY && !INTERACT) { // Allow shift in timing-test mode - ASSERT(HERE, RES_SHIFT == 0ull, "Shifted residues unsupported for Pépin test with Gerbicz check!\n"); + ASSERT(RES_SHIFT == 0ull, "Shifted residues unsupported for Pépin test with Gerbicz check!\n"); } memcpy(d, b, nbytes); // If doing a PRP test, init redundant copy d[] Gerbicz residue-product accumulator b[]. } if(restart) { - if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf_nowarn(cbuf,STR_MAX_LEN, "Restarting %s at iteration = %u, residue shift count = %llu.\nRes64,Res35m1,Res36m1: %016llX,%llu,%llu\n",PSTRING,ilo,RES_SHIFT,Res64,Res35m1,Res36m1); - else snprintf_nowarn(cbuf,STR_MAX_LEN, "Restarting %s at iteration = %u. 
Res64: %016llX, residue shift count = %llu\n",PSTRING,ilo,Res64,RES_SHIFT); + if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf(cbuf,STR_MAX_LEN*2, "Restarting %s at iteration = %u, residue shift count = %" PRIu64 ".\nRes64,Res35m1,Res36m1: %016" PRIX64 ",%" PRIu64 ",%" PRIu64 "\n",PSTRING,ilo,RES_SHIFT,Res64,Res35m1,Res36m1); + else snprintf(cbuf,STR_MAX_LEN*2, "Restarting %s at iteration = %u. Res64: %016" PRIX64 ", residue shift count = %" PRIu64 "\n",PSTRING,ilo,Res64,RES_SHIFT); mlucas_fprint(cbuf,0); } /*...Restart and FFT info. */ - snprintf_nowarn(cbuf,STR_MAX_LEN,"%s: using FFT length %uK = %u 8-byte floats, initial residue shift count = %llu\n",PSTRING,kblocks,n,RES_SHIFT); + snprintf(cbuf,STR_MAX_LEN*2,"%s: using FFT length %uK = %u 8-byte floats, initial residue shift count = %" PRIu64 "\n",PSTRING,kblocks,n,RES_SHIFT); sprintf(cstr,"This gives an average %20.15f bits per digit\n",1.0*p/n); strcat(cbuf,cstr); if(TEST_TYPE == TEST_TYPE_PRP) { sprintf(cstr,"The test will be done in form of a %u-PRP test.\n",PRP_BASE); strcat(cbuf,cstr); @@ -1774,13 +1774,13 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov int update_shift = (RES_SHIFT != 0ull); // If shift = 0 at outset, don't update (only need for Fermat-mod, due to the random-bit aspect there) if(TEST_TYPE == TEST_TYPE_PM1 && ilo >= maxiter) { - ASSERT(HERE, ilo == maxiter && ilo == PM1_S1_PROD_BITS,"For completed S1 expect ilo == maxiter == PM1_S1_PROD_BITS!"); - snprintf_nowarn(cbuf,STR_MAX_LEN, "%s: p-1 stage 1 to b1 = %u already done -- proceeding to stage 2.\n",PSTRING,B1); + ASSERT(ilo == maxiter && ilo == PM1_S1_PROD_BITS,"For completed S1 expect ilo == maxiter == PM1_S1_PROD_BITS!"); + snprintf(cbuf,STR_MAX_LEN*2, "%s: p-1 stage 1 to b1 = %u already done -- proceeding to stage 2.\n",PSTRING,B1); fprintf(stderr,"%s",cbuf); ilo = ihi; // Need this to differentiate between just-completed S1 and S1 residue read from restart file, goto PM1_STAGE2;// in terms of 
whether we need to do a GCD before proceeding to S2 } else if(KNOWN_FACTORS[0] != 0ull) { // PRP-CF - but if ilo < (p-1) it's in the PRP-phase, handle like regular PRP run until that completes - ASSERT(HERE, TEST_TYPE == TEST_TYPE_PRP,"One or more known-factors in workfile entry requires a PRP= assignment type!"); + ASSERT(TEST_TYPE == TEST_TYPE_PRP,"One or more known-factors in workfile entry requires a PRP= assignment type!"); if( ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) && (ilo >= p)) || ((MODULUS_TYPE == MODULUS_TYPE_FERMAT) && (ilo >= p-1)) ) goto PM1_STAGE2; // The CF-handling is a clause of the if/else beginning at this label @@ -1788,7 +1788,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov for(;;) { - ASSERT(HERE, maxiter > 0,"Require (uint32)maxiter > 0"); + ASSERT(maxiter > 0,"Require (uint32)maxiter > 0"); if(ihi > maxiter) ihi = maxiter; // If p-1: start of each iteration cycle, copy bits ilo:ihi-1 of PM1_S1_PRODUCT into low bits of BASE_MULTIPLIER_BITS vector: @@ -1806,8 +1806,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov itmp64 = ~(-1ull << j); BASE_MULTIPLIER_BITS[i-1] &= itmp64;// ...and zero any excess bits at the high end. for(i = 0, itmp64 = 0ull; i < s1p_alloc; i++) { itmp64 += PM1_S1_PRODUCT[i]; } if(itmp64 != PM1_S1_PROD_RES64) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"PM1_S1_PRODUCT (mod 2^64_ checksum mismatch! (Current[%llu] != Reference[%llu]). Aborting due to suspected data corruption.\n",itmp64,PM1_S1_PROD_RES64); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"PM1_S1_PRODUCT (mod 2^64_ checksum mismatch! (Current[%" PRIu64 "] != Reference[%" PRIu64 "]). Aborting due to suspected data corruption.\n",itmp64,PM1_S1_PROD_RES64); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } } /* Here's the big one - (ITERS_BETWEEN_CHECKPOINTS) squaring steps. 
@@ -1827,7 +1827,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov i = ilo; tdiff = 0.0; // Need 2 timers here - tdif2 for the individual func_mod_square calls, accumulate in tdiff while(!ierr && MLUCAS_KEEP_RUNNING && i < ihi) { // See G-check code for why this logfile-print of initial-G-check-update residue shift value is needed in Fermat-mod case: - if(i == ITERS_BETWEEN_GCHECK_UPDATES) { sprintf(cbuf,"At iter ITERS_BETWEEN_GCHECK_UPDATES = %u: RES_SHIFT = %llu\n",i,RES_SHIFT); mlucas_fprint(cbuf,1); } + if(i == ITERS_BETWEEN_GCHECK_UPDATES) { sprintf(cbuf,"At iter ITERS_BETWEEN_GCHECK_UPDATES = %u: RES_SHIFT = %" PRIu64 "\n",i,RES_SHIFT); mlucas_fprint(cbuf,1); } /* If restart-after-interrupt and thus ilo neither a non-multiple of ITERS_BETWEEN_CHECKPOINTS nor of ITERS_BETWEEN_GCHECK_UPDATES, round first i-update > ilo to nearest multiple of ITERS_BETWEEN_GCHECK_UPDATES: */ @@ -1877,8 +1877,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov fprintf(stderr,"Caught interrupt in fFFT(c) step.\n"); break; } else { - snprintf_nowarn(cbuf,STR_MAX_LEN,"Unhandled Error of type[%u] = %s in fFFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr)); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"Unhandled Error of type[%u] = %s in fFFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. 
Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr)); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); // goto GET_NEXT_ASSIGNMENT; } } @@ -1891,7 +1891,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } else if(s1 == sum64(d_uint64_ptr, n)) { // c-data good, copy back into b memcpy(d, b, nbytes); } else // Catastrophic data corruption - ASSERT(HERE, 0, "Catastrophic data corruption detected in G-checkproduct integrity validation ... rolling back to last good G-check. "); + ASSERT(0, "Catastrophic data corruption detected in G-checkproduct integrity validation ... rolling back to last good G-check. "); } // First subinterval: [b] needs fwd-weighting and initial-fwd-FFT-pass done on entry, !undone on exit: mode_flag = 10_2 @@ -1907,8 +1907,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov fprintf(stderr,"Caught interrupt in FFT(b)*FFT(c) step.\n"); break; } else { - snprintf_nowarn(cbuf,STR_MAX_LEN,"Unhandled Error of type[%u] = %s in FFT(b)*FFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr)); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"Unhandled Error of type[%u] = %s in FFT(b)*FFT(c) step - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. 
Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr)); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); // goto GET_NEXT_ASSIGNMENT; } } @@ -1991,7 +1991,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } // v20: Simplify the logic here - skip previous interval-retry step: if((ierr == ERR_ROUNDOFF) && !INTERACT) { - ASSERT(HERE, ROE_ITER > 0, "ERR_ROUNDOFF returned but ROE_ITER <= 0!"); + ASSERT(ROE_ITER > 0, "ERR_ROUNDOFF returned but ROE_ITER <= 0!"); n = get_nextlarger_fft_length(n); kblocks = (n >> 10); sprintf(cbuf," Switching to next-larger available FFT length %uK and restarting from last checkpoint file.\n",kblocks); mlucas_fprint(cbuf,1); @@ -2049,7 +2049,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S",local_time); const char*iter_or_stage[] = {"Iter#","S1 bit"}; // Tag indicates Primality/PRP-test or p-1 S1 iteration /*...print [date in hh:mm:ss | p | iter-count-or-stage progress | %-complete | time | per-iter time | Res64 | max ROE | residue-shift] */ - snprintf_nowarn(cbuf,STR_MAX_LEN, "[%s] %s %s = %u [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016llX. AvgMaxErr = %10.9f. MaxErr = %10.9f. Residue shift count = %llu.\n" + snprintf(cbuf,STR_MAX_LEN*2, "[%s] %s %s = %u [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016" PRIX64 ". AvgMaxErr = %10.9f. MaxErr = %10.9f. Residue shift count = %" PRIu64 ".\n" , timebuffer, PSTRING, iter_or_stage[TEST_TYPE == TEST_TYPE_PM1], ihi, (float)ihi / (float)maxiter * 100,get_time_str(tdiff) , 1000*get_time(tdiff)/(ihi - ilo), Res64, AME, MME, RES_SHIFT); mlucas_fprint(cbuf,scrnFlag); @@ -2089,8 +2089,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov fprintf(stderr,"Caught interrupt in Gerbicz-checkproduct mod-squaring update ... 
skipping G-check and savefile-update and performing immediate-exit.\n"); exit(1); } else { - snprintf_nowarn(cbuf,STR_MAX_LEN,"Unhandled Error of type[%u] = %s in Gerbicz-checkproduct mod-squaring update - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr)); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"Unhandled Error of type[%u] = %s in Gerbicz-checkproduct mod-squaring update - please send e-mail to ewmayer@aol.com with copy of the p*.stat file attached. Proceeding to next assignment...\n",ierr,returnMlucasErrCode(ierr)); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); // goto GET_NEXT_ASSIGNMENT; } } @@ -2127,7 +2127,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(filegrep(STATFILE,"ITERS_BETWEEN_GCHECK_UPDATES",cbuf,0)) { char_addr = strstr(cbuf,"RES_SHIFT = ") + 12; // Skip ahead by length of search-substring itmp64 = strtoull(char_addr, &cptr, 10); - ASSERT(HERE, itmp64 != -1ull, "strtoull() overflow detected."); + ASSERT(itmp64 != -1ull, "strtoull() overflow detected."); } #else itmp64 = RES_SHIFT; @@ -2145,8 +2145,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } #endif } - fprintf(stderr,"Recovered initial shift %llu\n",itmp64); - ASSERT(HERE, (itmp64>>32) == 0ull,"Shift must be < 2^32!"); + fprintf(stderr,"Recovered initial shift %" PRIu64 "\n",itmp64); + ASSERT((itmp64>>32) == 0ull,"Shift must be < 2^32!"); GCHECK_SHIFT = itmp64; } mi64_shlc(c_uint64_ptr, c_uint64_ptr, (uint32)p, (uint32)GCHECK_SHIFT, j, (MODULUS_TYPE == MODULUS_TYPE_FERMAT)); @@ -2155,7 +2155,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // Use mi64 routines to compute d[]*PRP_BASE and do ensuing equality check: itmp64 = ((MODULUS_TYPE == MODULUS_TYPE_FERMAT) ? 
3ull : (uint64)PRP_BASE); // Fermat-mod uses PRP_BASE to store 2 for random-shift-offset scheme c_uint64_ptr[j] = mi64_mul_scalar(c_uint64_ptr, itmp64, c_uint64_ptr, j); - ASSERT(HERE, c_uint64_ptr[j] == 0ull, "d[]*PRP_BASE result has unexpected carryout!"); + ASSERT(c_uint64_ptr[j] == 0ull, "d[]*PRP_BASE result has unexpected carryout!"); // Need to (mod N) ... store modulus N in d[] doubles-array, which is freed up by above convert_res_FP_bytewise(d,...) call: if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { // Loop rather than call to mi64_set_eq_scalar here, since need to set all elts = -1: @@ -2169,11 +2169,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov for(i = 1; i < itmp64; i++) { if(!c_uint64_ptr[j] && mi64_cmpult(c_uint64_ptr,d_uint64_ptr,j)) break; cy = mi64_sub(c_uint64_ptr,d_uint64_ptr,c_uint64_ptr,j); // c -= d, with d = 2^p-1 - c_uint64_ptr[j] -= cy; //ASSERT(HERE, cy == 0ull, "mi64_sub result has unexpected borrow!"); + c_uint64_ptr[j] -= cy; //ASSERT(cy == 0ull, "mi64_sub result has unexpected borrow!"); } - ASSERT(HERE, mi64_cmpult(c_uint64_ptr,d_uint64_ptr,j), "Gerbicz checkproduct reduction (mod 2^p-1) failed!"); + ASSERT(mi64_cmpult(c_uint64_ptr,d_uint64_ptr,j), "Gerbicz checkproduct reduction (mod 2^p-1) failed!"); if(mi64_cmp_eq(e_uint64_ptr,c_uint64_ptr,j)) { - sprintf(cbuf,"At iteration %u, shift = %llu: Gerbicz check passed.\n",ihi,RES_SHIFT); + sprintf(cbuf,"At iteration %u, shift = %" PRIu64 ": Gerbicz check passed.\n",ihi,RES_SHIFT); mlucas_fprint(cbuf,0); // In G-check case we need b[] for that, thus skipped the d = b redundancy-copy ... do that now: memcpy(d, b, nbytes); @@ -2214,7 +2214,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov strcpy(cstr, RESTARTFILE); strcat(cstr, cbuf); if(rename(RESTARTFILE, cstr)) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: unable to rename %s restart file ==> %s ... 
skipping every-10M-iteration restart file archiving\n",WORKFILE,cstr); + snprintf(cbuf,STR_MAX_LEN*2,"ERROR: unable to rename %s restart file ==> %s ... skipping every-10M-iteration restart file archiving\n",WORKFILE,cstr); fprintf(stderr,"%s",cbuf); } } // ilo a multiple of 10 million? @@ -2235,7 +2235,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov RESTARTFILE[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f'); } } else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open restart file %s for write of checkpoint data.\n",RESTARTFILE); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open restart file %s for write of checkpoint data.\n",RESTARTFILE); mlucas_fprint(cbuf,1); /* Don't want to assert here - asllow processing to continue, in case this is a transient failure-to-open. @@ -2255,7 +2255,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov write_ppm1_savefiles(cstr,p,n,fp, itmp64, (uint8*)arrtmp,Res64,Res35m1,Res36m1, (uint8*)e_uint64_ptr,i1,i2,i3); fclose(fp); fp = 0x0; } else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open Gerbicz-check savefile %s for write of checkpoint data.\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open Gerbicz-check savefile %s for write of checkpoint data.\n",cstr); mlucas_fprint(cbuf,1); } } // ihi a multiple of ITERS_BETWEEN_GCHECKS? @@ -2305,7 +2305,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov /* If Selftest mode... */ if(INTERACT) { - fprintf(stderr, "%u iterations of %s with FFT length %u = %u K, final residue shift count = %llu\n",timing_test_iters,PSTRING,n,kblocks,RES_SHIFT); + fprintf(stderr, "%u iterations of %s with FFT length %u = %u K, final residue shift count = %" PRIu64 "\n",timing_test_iters,PSTRING,n,kblocks,RES_SHIFT); // If TEST_TYPE non-default (e.g. 
PRP for Mersennes), add text indicating that: if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE && TEST_TYPE == TEST_TYPE_PRP) sprintf(cbuf,"PRP-%u ",PRP_BASE); @@ -2314,15 +2314,15 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov /* If Fermat number, make sure exponent a power of 2: */ if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) - ASSERT(HERE, (p >> findex) == 1,"Require (p >> findex) == 1"); + ASSERT((p >> findex) == 1,"Require (p >> findex) == 1"); if(timing_test_iters > AME_ITER_START) { AME /= (timing_test_iters - AME_ITER_START); - fprintf(stderr, "%sRes64: %016llX. AvgMaxErr = %10.9f. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, AME, MME, VERSION); + fprintf(stderr, "%sRes64: %016" PRIX64 ". AvgMaxErr = %10.9f. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, AME, MME, VERSION); } else { - fprintf(stderr, "%sRes64: %016llX. AvgMaxErr N/A. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, MME, VERSION); + fprintf(stderr, "%sRes64: %016" PRIX64 ". AvgMaxErr N/A. MaxErr = %10.9f. Program: E%s\n", cbuf, Res64, MME, VERSION); } - /* MSVC/.NET incorrectly output these when using uint64 and %20llu format, so cast to double and print: */ + /* MSVC/.NET incorrectly output these when using uint64 and %20" PRIu64 " format, so cast to double and print: */ fprintf(stderr, "Res mod 2^35 - 1 = %20.0f\n",(double)Res35m1); fprintf(stderr, "Res mod 2^36 - 1 = %20.0f\n",(double)Res36m1); /* If they are provided, check the Selfridge-Hurwitz residues: */ @@ -2382,7 +2382,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov */ if(TEST_TYPE == TEST_TYPE_PRP && MODULUS_TYPE != MODULUS_TYPE_FERMAT) // Applies only to mod-M(p) case, { // Pepin-test and LL are handled in next clause. 
- ASSERT(HERE, ihi == p, "Gerbicz-check-modified PRP-test requires p mod-squarings!"); + ASSERT(ihi == p, "Gerbicz-check-modified PRP-test requires p mod-squarings!"); /* Final PRP-residue which is *reported*, OTOH, is the standard Fermat-style (p-2)-squaring one. That requires us to do 2 mod-divs of the 2-squares-too-many prp-residue r by the PRP-test base b. If b divides r, we're good. Otherwise, need to find multiple of modulus m = 2^p-1 which needs to @@ -2410,7 +2410,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov mmodb = twopmmodq64(p,itmp64); // m" // In the most common case PRP_BASE = 3, use that 2^6 == 1 (mod 9), thus 2^p == 2^(p mod 6) (mod 9) if(PRP_BASE == 3) - ASSERT(HERE, mmodb == (1ull<<(p % 6)) % 9,"2^p == 2^(p mod 6) (mod 9) fails!"); + ASSERT(mmodb == (1ull<<(p % 6)) % 9,"2^p == 2^(p mod 6) (mod 9) fails!"); // mmodb = (2^p-1) % base ... for reasons unknown, the macro MOD_SUB64 was not inlined properly under gdb if(mmodb) mmodb--; @@ -2424,14 +2424,14 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov #else MUL_LOHI64(rmodb,i1, i2, i3); #endif - i2 %= itmp64; ASSERT(HERE, i3 == 0ull, "K-multiplier needs 64-bit reduction (mod b^2)!"); + i2 %= itmp64; ASSERT(i3 == 0ull, "K-multiplier needs 64-bit reduction (mod b^2)!"); if(i2) i2 = itmp64 - i2; // if(k) k = -r".mi" (mod b^2) = b^2 - r".mi" . // i2 contains the needed multiplier k. Since ensuing quotient computation needs separate arrays // for dividend and quotient, stash output of mi64_mul_scalar_add_vec2 in c[] and ensuing quotient back in arrtmp[]: c_uint64_ptr[j] = mi64_mul_scalar_add_vec2(d_uint64_ptr,i2,arrtmp, c_uint64_ptr, j); // Now short-div - allowing for the possibility of a carryout from above mi64_mul_scalar_add_vec2() call - // by base and check that remainder 0. 
Note that we do want the quotient now, as that is our reside/base: - mi64_div(c_uint64_ptr, &itmp64, j+1,1, arrtmp,&rmodb); ASSERT(HERE, rmodb == 0ull,"After short-div, R != 0 (mod B)"); + mi64_div(c_uint64_ptr, &itmp64, j+1,1, arrtmp,&rmodb); ASSERT(rmodb == 0ull,"After short-div, R != 0 (mod B)"); // And recompute the S-H residues: res_SH(arrtmp,j,&Res64,&Res35m1,&Res36m1); // Now that residue is standard Fermat-PRP-test one, check if == 1: @@ -2465,7 +2465,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov char Res2048[513]; // Must save Res2048 before PRP cofactor test: https://github.com/primesearch/Mlucas/issues/25 if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) - for (int i = 31; i >= 0; i--) sprintf(Res2048+496-i*16, "%016llX", arrtmp[i]); + for (int i = 31; i >= 0; i--) sprintf(Res2048+496-i*16, "%016" PRIX64, arrtmp[i]); // v21: PRP-CF: Cofactor-PRP test applies to primality/Fermat (which we follow by 1 additional mod-squaring // to convert the base^((N-1)/2) Pepin/Euler-PRP residue to a base^(N-1) Fermat-PRP one) and PRP/Mersenne residues: @@ -2491,7 +2491,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(isprime) { if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { /*... this gets written both to file and to stdout, the latter irrespective of whether the run is in interactive mode... 
*/ - snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is a new FERMAT PRIME!!!\nPlease send e-mail to ewmayer@aol.com.\n",PSTRING); + snprintf(cbuf,STR_MAX_LEN*2, "%s is a new FERMAT PRIME!!!\nPlease send e-mail to ewmayer@aol.com.\n",PSTRING); mlucas_fprint(cbuf,1); } else if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2501,16 +2501,16 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov break; } if(knowns[i] != 0) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is a known MERSENNE PRIME.\n",PSTRING); + snprintf(cbuf,STR_MAX_LEN*2, "%s is a known MERSENNE PRIME.\n",PSTRING); mlucas_fprint(cbuf,(INTERACT || scrnFlag)); // Latter clause == "Echo output to stderr?" } else { // This gets written both to file and to stderr, the latter irrespective of whether the run is in interactive mode: - snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is a (probable) new MERSENNE PRIME!!!\nPlease send e-mail to ewmayer@aol.com and woltman@alum.mit.edu.\n",PSTRING); + snprintf(cbuf,STR_MAX_LEN*2, "%s is a (probable) new MERSENNE PRIME!!!\nPlease send e-mail to ewmayer@aol.com and woltman@alum.mit.edu.\n",PSTRING); mlucas_fprint(cbuf,1); } } else - ASSERT(HERE, 0, "Unsupported modulus type!"); + ASSERT(0, "Unsupported modulus type!"); } /* The more likely scenario - it's not prime, so we form a 64-bit residue and write that. @@ -2520,11 +2520,11 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // Otherwise, write the 64-bit hex residue. As of v19, we write the old-style HRF-formatted result // just to the exponent-specific logfile, and the server-expected JSON-formatted result to the results file: // Note that Fermat primality tests are not submitted to server, so accordingly we slightly modify the output. More info: https://github.com/primesearch/Mlucas/pull/11 - snprintf_nowarn(cbuf,STR_MAX_LEN, "%s is not prime. Program: E%s. Final residue shift count = %llu.\n",PSTRING,VERSION,RES_SHIFT); + snprintf(cbuf,STR_MAX_LEN*2, "%s is not prime. 
Program: E%s. Final residue shift count = %" PRIu64 ".\n",PSTRING,VERSION,RES_SHIFT); mlucas_fprint(cbuf,1); - if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf_nowarn(cbuf,STR_MAX_LEN, "Selfridge-Hurwitz residues Res64,Res35m1,Res36m1 = %016llX,%11llu,%11llu.\n",Res64,Res35m1,Res36m1); + if (MODULUS_TYPE == MODULUS_TYPE_FERMAT) snprintf(cbuf,STR_MAX_LEN*2, "Selfridge-Hurwitz residues Res64,Res35m1,Res36m1 = %016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 ".\n",Res64,Res35m1,Res36m1); else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); // v19: Finish with the JSON-formatted result line: fp = mlucas_fopen(OFILE,"a"); if(fp) { @@ -2534,7 +2534,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov mlucas_fprint(cbuf,1); } } else if (MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { // Cofactor-PRP run: - snprintf_nowarn(cbuf,STR_MAX_LEN,"If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2,"If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); mlucas_fprint(cbuf,1); // Write JSON-formatted result line to results file: fp = mlucas_fopen(OFILE,"a"); @@ -2562,7 +2562,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov gm_time = localtime(&calendar_time); strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S UTC",gm_time); generate_JSON_report(0,p,n,0ull,NULL,timebuffer, B1,B2,gcd_str,s2_partial, cstr); // cstr holds JSONified output - snprintf_nowarn(cbuf,STR_MAX_LEN, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); + 
snprintf(cbuf,STR_MAX_LEN*2, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); mlucas_fprint(cbuf,0); fp = mlucas_fopen(OFILE,"a"); if(fp) { @@ -2596,7 +2596,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if( PM1_S2_NBUF > ((uint32)(j*1024./(n>>7)) - 5) ) fprintf(stderr,"WARNING: User-specified maximum number of Stage 2 buffers may exceed %u MB of available RAM.\n",j); } - ASSERT(HERE, PM1_S2_NBUF >= 24,"p-1 Stage 2 requires at least 24 residue-sized memory buffers!\n"); + ASSERT(PM1_S2_NBUF >= 24,"p-1 Stage 2 requires at least 24 residue-sized memory buffers!\n"); // See if S2 restart file exists: strcpy(cstr,RESTARTFILE); cstr[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f'); strcat(cstr, ".s2"); // If a regular (non-continuation, i.e. B2_start = B1) stage 2 and S2 restart file exists, read @@ -2608,17 +2608,17 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // This snip reads the relocation-prime from the high byte of the nsquares field, byte 10 of the S2 savefile: i = fgetc(fp); if(!test_types_compatible(i, TEST_TYPE)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "%s: TEST_TYPE != fgetc(fp)\n",cstr); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "%s: TEST_TYPE != fgetc(fp)\n",cstr); ASSERT(0,cbuf); } if((i = fgetc(fp)) != MODULUS_TYPE) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: %s: MODULUS_TYPE != fgetc(fp)\n",cstr); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: %s: MODULUS_TYPE != fgetc(fp)\n",cstr); ASSERT(0,cbuf); } itmp64 = 0ull; for(j = 0; j < 8; j++) { i = fgetc(fp); itmp64 += (uint64)i << (8*j); } fclose(fp); fp = 0x0; if(i != EOF) // Needed to handle case where .s2 file was touched but ended up empty or < 10 bytes long psmall = i; itmp64 &= 0x00FFFFFFFFFFFFFFull; // Mask off psmall to get stage 2 q of checkpoint data - fprintf(stderr,"Read iter = %llu and relocation-prime psmall = 
%u from savefile %s.\n",itmp64,psmall,cstr); + fprintf(stderr,"Read iter = %" PRIu64 " and relocation-prime psmall = %u from savefile %s.\n",itmp64,psmall,cstr); // Now parse logfile to get proper B2 and validate corresponding B2_start vs B2/[psmall from .s2 file]. // Logfiles can be messy and include one or more aborted-restarts; we want the last B2_start-containing // entry followed by a savefile-write entry, as inferred from presence of a "% complete" substring: @@ -2627,9 +2627,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // If match "B2_start =", read bigstep D from match-line and infer relocation-prime psmall from D: if(strlen(cbuf)) { char_addr = strstr(cbuf,"B2_start = "); - B2_start = (uint64)strtoull(char_addr+11, &cptr, 10); ASSERT(HERE, B2_start != -1ull, "strtoull() overflow detected."); + B2_start = (uint64)strtoull(char_addr+11, &cptr, 10); ASSERT(B2_start != -1ull, "strtoull() overflow detected."); char_addr = strstr(cbuf,"B2 = "); - B2 = (uint64)strtoull(char_addr+5, &cptr, 10); ASSERT(HERE, B2 != -1ull, "strtoull() overflow detected."); + B2 = (uint64)strtoull(char_addr+5, &cptr, 10); ASSERT(B2 != -1ull, "strtoull() overflow detected."); char_addr = strstr(cbuf,"Bigstep = "); if(char_addr) { i = strtoul(char_addr+10, &endp, 10); @@ -2644,7 +2644,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } // Now compare the params from the restartfile vs those captured in the log: if(psmall) - ASSERT(HERE, psmall == i && B2_start == B2/psmall, "Stage 2 params mismatch those captured in the .stat logfile!"); + ASSERT(psmall == i && B2_start == B2/psmall, "Stage 2 params mismatch those captured in the .stat logfile!"); else psmall = i; // If stage 2 q of checkpoint >= B2, proceed directly to GCD: @@ -2674,7 +2674,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov exit(1); } else if(ierr) { sprintf(cbuf,"p-1 stage 2 hit an unhandled error of 
type[%u] = %s! Aborting.",ierr,returnMlucasErrCode(ierr)); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } // If gcd_str non-empty on return, it means one of the intermediate S2 GCDs turned up a factor, // prompting an early-return, In this case the S2 code will have reset B2 to reflect the actual interval run. @@ -2694,7 +2694,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov gm_time = localtime(&calendar_time); strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S UTC",gm_time); generate_JSON_report(0,p,n,0ull,NULL,timebuffer, B1,B2,gcd_str,s2_partial, cstr); // cstr holds JSONified output - snprintf_nowarn(cbuf,STR_MAX_LEN, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2, "If using the manual results submission form at mersenne.org, paste the following JSON-formatted results line:\n%s\n",cstr); mlucas_fprint(cbuf,0); fp = mlucas_fopen(OFILE,"a"); if(fp){ @@ -2705,7 +2705,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov fprintf(stderr,"User specified low-mem run mode ... 
no stage 2.\n"); } } else { - ASSERT(HERE, 0, "Unrecognized test type!"); + ASSERT(0, "Unrecognized test type!"); } /* endif(TEST_TYPE == TEST_TYPE_PRIMALITY) */ /*...If successful completion, delete the secondary restart files...save the primary in case it's a prime, @@ -2728,7 +2728,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov */ strcpy(cstr, RESTARTFILE); strcat(cstr, ".s2"); if(remove(cstr)) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"INFO: Unable to remove stage 2 savefile %s.\n",cstr); + snprintf(cbuf,STR_MAX_LEN*2,"INFO: Unable to remove stage 2 savefile %s.\n",cstr); mlucas_fprint(cbuf,1); } if(!s2_continuation) { @@ -2743,7 +2743,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // if primary missing/corrupt, rename secondary q[expo] ==> [p|f][expo].s1: if(TEST_TYPE == TEST_TYPE_PM1 && !s2_continuation) { if(rename(RESTARTFILE, cstr)) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: unable to rename the p-1 stage 1 savefile %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr); + snprintf(cbuf,STR_MAX_LEN*2,"ERROR: unable to rename the p-1 stage 1 savefile %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr); mlucas_fprint(cbuf,1); } } else if(TEST_TYPE == TEST_TYPE_PRIMALITY || TEST_TYPE == TEST_TYPE_PRP) { @@ -2751,7 +2751,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(RESTARTFILE[0] == 'q') { RESTARTFILE[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f'); if(rename(cstr, RESTARTFILE)) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"ERROR: Primary savefile missing/corrupt, but unable to rename the secondary %s ==> %s ... any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr); + snprintf(cbuf,STR_MAX_LEN*2,"ERROR: Primary savefile missing/corrupt, but unable to rename the secondary %s ==> %s ... 
any ensuing LL/PRP test will overwrite.\n",RESTARTFILE,cstr); mlucas_fprint(cbuf,1); } } else if(remove(cstr)) // ...otherwise delete the secondary @@ -2785,21 +2785,21 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(!INTERACT) { //*** IN THIS CASE MUST MAKE SURE CBUF,CSTR ONLY GET OVERWRITTEN ON ERROR ERROR, SINCE THEY CONTAIN THE SPLIT ASSIGNMENT! *** if(split_curr_assignment) { - sprintf(ESTRING,"%llu",p); // Set ESTRING here, as this bypasses the normal route for getting to GET_NEXT_ASSIGNMENT - ASSERT(HERE, TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: split_curr_assignment = TRUE, but TEST_TYPE != PM1."); + sprintf(ESTRING,"%" PRIu64,p); // Set ESTRING here, as this bypasses the normal route for getting to GET_NEXT_ASSIGNMENT + ASSERT(TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: split_curr_assignment = TRUE, but TEST_TYPE != PM1."); } fp = mlucas_fopen(WORKFILE,"r"); if(!fp) { sprintf(cbuf,"ERROR: unable to open %s file for reading.\n",WORKFILE); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } /* Remove any WINI.TMP file that may be present: */ remove("WINI.TMP"); fq = mlucas_fopen("WINI.TMP", "w"); if(!fq) { sprintf(cbuf, "Unable to open WINI.TMP file for writing.\n"); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } GET_NEXT: @@ -2807,7 +2807,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov i = 0; // This counter tells how many *additional* assignments exist in worktodo if(!fgets(in_line, STR_MAX_LEN, fp)) { sprintf(cbuf, "ERROR: %s file not found at end of current-assignment processing\n", WORKFILE); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } // v20.1.1: Parse all lines whose 1st non-WS char is alphabetic; char_addr = in_line; j = 0; @@ -2818,8 +2818,8 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov // Look for m in first eligible assignment; for F[m], need to also look for 2^m in case assignment is in KBNC format: if(!strstr(in_line, ESTRING) && 
!(MODULUS_TYPE == MODULUS_TYPE_FERMAT && strstr(in_line, BIN_EXP)) ) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: Current exponent %s not found in line 1 of %s file - quitting.\n", ESTRING, WORKFILE); - ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: Current exponent %s not found in line 1 of %s file - quitting.\n", ESTRING, WORKFILE); + ASSERT(0,cbuf); } else { /* If we just finished the TF or p-1 preprocessing step of an LL or PRP test, update the current-assignment line to reflect that and write it out: */ @@ -2828,7 +2828,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov if(TEST_TYPE == TEST_TYPE_TF) { /* Factor depth assumed to follow the first comma in in_line: */ char_addr = strstr(char_addr, ","); - ASSERT(HERE, char_addr != 0x0,"Null char_addr"); + ASSERT(char_addr != 0x0,"Null char_addr"); sprintf(++char_addr, "%u", TF_BITS); fputs(in_line, fq); } @@ -2843,16 +2843,16 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov } else if(stristr(in_line, "pminus1")) { // If current p-1 assignment found a factor and resulted from splitting of a PRP/LL assignment - // note that split_curr_assignment == TRUE only at time of the initial splitting - delete them both: - ASSERT(HERE, TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: current assignment is Pminus1=, but TEST_TYPE != PM1."); + ASSERT(TEST_TYPE == TEST_TYPE_PM1,"GET_NEXT_ASSIGNMENT: current assignment is Pminus1=, but TEST_TYPE != PM1."); if(strlen(gcd_str) != 0) { // Found a factor? - char_addr = strstr(in_line, "="); ASSERT(HERE,char_addr != 0x0,"Malformed assignment!"); + char_addr = strstr(in_line, "="); ASSERT(char_addr != 0x0,"Malformed assignment!"); char_addr++; if(is_hex_string(char_addr, 32)) { strncpy(aid,char_addr,32); } else if(STREQN_NOCASE(char_addr,"n/a",3)) { strncpy(aid,char_addr, 3); } else { - snprintf_nowarn(cbuf,STR_MAX_LEN,"INFO: Assignment \"%s\" lacks a valid assignment ID ... 
proceeding anyway.\n",in_line); + snprintf(cbuf,STR_MAX_LEN*2,"INFO: Assignment \"%s\" lacks a valid assignment ID ... proceeding anyway.\n",in_line); mlucas_fprint(cbuf,1); aid[0] = '\0'; // This guarantees that the strstr(in_line,aid) part of on the next-assignment search below succeeds. } @@ -2892,13 +2892,13 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov fp = mlucas_fopen(WORKFILE,"w"); if(!fp) { sprintf(cbuf,"ERROR: unable to open %s file for writing.\n", WORKFILE); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } fq = mlucas_fopen("WINI.TMP", "r"); if(!fq) { sprintf(cbuf,"Unable to open WINI.TMP file for reading.\n"); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } while(fgets(in_line, STR_MAX_LEN, fq)) { fputs(in_line, fp); @@ -2954,21 +2954,21 @@ void Mlucas_init(void) /* Set min. exponent (in terms of power of 2) that can be tested: */ /* Check that the purported min. FFT length is actually supported: */ - ASSERT(HERE, get_fft_radices(MIN_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MIN_FFT_LENGTH_IN_K, 0) == 0"); + ASSERT(get_fft_radices(MIN_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MIN_FFT_LENGTH_IN_K, 0) == 0"); n = (MIN_FFT_LENGTH_IN_K << 10); /* Make sure N didn't overflow */ - ASSERT(HERE, (n >> 10) == MIN_FFT_LENGTH_IN_K,"Require (n >> 10) == MIN_FFT_LENGTH_IN_K"); + ASSERT((n >> 10) == MIN_FFT_LENGTH_IN_K,"Require (n >> 10) == MIN_FFT_LENGTH_IN_K"); PMIN = 2*n; /* 2 bits per input is about the smallest we can test without getting nonzero-carry errors */ /* Set max. exponent (in terms of power of 2) that can be tested: */ /* Check that the purported max. 
FFT length is actually supported: */ - ASSERT(HERE, get_fft_radices(MAX_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MAX_FFT_LENGTH_IN_K, 0) == 0"); + ASSERT(get_fft_radices(MAX_FFT_LENGTH_IN_K, 0, 0x0, 0x0, 0) == 0,"Require get_fft_radices(MAX_FFT_LENGTH_IN_K, 0) == 0"); n = (MAX_FFT_LENGTH_IN_K << 10); /* Make sure N didn't overflow */ - ASSERT(HERE, (n >> 10) == MAX_FFT_LENGTH_IN_K,"Require (n >> 10) == MAX_FFT_LENGTH_IN_K"); + ASSERT((n >> 10) == MAX_FFT_LENGTH_IN_K,"Require (n >> 10) == MAX_FFT_LENGTH_IN_K"); PMAX = 1.05*given_N_get_maxP(n); // Allow same wiggle room here as in ernstMain - ASSERT(HERE, PMAX > PMIN,"Require PMAX > PMIN"); + ASSERT(PMAX > PMIN,"Require PMAX > PMIN"); #if INCLUDE_TF /* Simple self-tester for sieve factoring routines: */ @@ -2976,7 +2976,7 @@ void Mlucas_init(void) if(test_fac() != 0) { sprintf(cbuf, "Mlucas_init : Trial-factoring self-test failed.\n"); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } #endif #if 0 // v20: Use GMP GCD, own-rolled O(n*(log n)^2) one simply not in the cards. @@ -2985,7 +2985,7 @@ void Mlucas_init(void) if(test_gcd() != 0) { sprintf(cbuf, "Mlucas_init : GCD test failed.\n"); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } #endif } @@ -3038,16 +3038,16 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const uint64 nbits, itmp64; int64 retval = -1; // Make this signed to ease "not yet set?" 
check #ifdef USE_FGT61 - ASSERT(HERE,0,"shift_word() needs to be modified to support FGT!"); + ASSERT(0,"shift_word() needs to be modified to support FGT!"); #endif if(n != nsave || p != psave) { first_entry = TRUE; for(j = 0; j < (n>>6); j++) { BIGWORD_BITMAP[j] = 0ull; } // Need to clear bitmap in case of multi-FFT-length run bw = p%n; sw = n-bw; /* If Fermat number, make sure exponent a power of 2: */ if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE,"Require TRANSFORM_TYPE == RIGHT_ANGLE"); + ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE,"Require TRANSFORM_TYPE == RIGHT_ANGLE"); findex = trailz64(p); - ASSERT(HERE, (p >> findex) == 1,"Require (p >> findex) == 1"); + ASSERT((p >> findex) == 1,"Require (p >> findex) == 1"); /* For Fermat-mod, only need IBDWT weights table if it's a non-power-of-2-length transform, in which case the table has {nwt = odd part of N} distinct elements. Avoid if() logic related to power-of-2-or-not by initing a single DWT weight = 1.0 in the power-of-2 case and = 2^((j%nwt)/n) otherwise: @@ -3056,7 +3056,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const sw_div_n = sw*nwt/n; } else - ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER,"Require TRANSFORM_TYPE == REAL_WRAPPER"); + ASSERT(TRANSFORM_TYPE == REAL_WRAPPER,"Require TRANSFORM_TYPE == REAL_WRAPPER"); /* Vector length a power of 2? */ pow2_fft = (n >> trailz32(n)) == 1; @@ -3072,7 +3072,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const // than I'd like, likely due to cache impacts of doing random-word lookups in the resulting 128kB and 64kB BIGWORD* arrays. // Also had the "adjusting..." 
printfs enabled during the timing tests, 0 such adjustments needed for 10^9 random-shifts: if(!first_entry) { - // ASSERT(HERE, BIGWORD_BITMAP != 0x0 && BIGWORD_NBITS != 0x0, "BIGWORD_BITMAP and BIGWORD_NBITS arrays not alloc'ed!"); + // ASSERT(BIGWORD_BITMAP != 0x0 && BIGWORD_NBITS != 0x0, "BIGWORD_BITMAP and BIGWORD_NBITS arrays not alloc'ed!"); // Divide [shift] by the average bits per word to get a quick estimate of which word contains the corresponding bit: j = shift*words_per_bit; w64 = j>>6; mod64 = j&63; // Then exactly compute the bitcount at the resulting word, by adding the BIGWORD_NBITS-array-stored exact @@ -3086,7 +3086,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const // Can gain a few % speed by commenting out this correction-step code, but even though I've encountered // no cases where it's used in my (admittedly quite limited) testing, better safe than sorry: if(shift < ii) { - // printf("shift[%llu] < ii [%u] ... adjusting downward.\n",shift,ii); + // printf("shift[%" PRIu64 "] < ii [%u] ... adjusting downward.\n",shift,ii); while(shift < ii) { if(--j < 0) { // Note j is signed j += 64; w64 = j>>6; mod64 = j&63; // Go to next-lower word of BIGWORD_BITMAP @@ -3097,7 +3097,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const ii -= curr_wd_bits; } } else if(shift >= (ii + curr_wd_bits) ) { - // printf("shift[%llu] >= (ii + curr_wd_bits) [%u] ... adjusting upward.\n",shift,(ii + curr_wd_bits)); + // printf("shift[%" PRIu64 "] >= (ii + curr_wd_bits) [%u] ... 
adjusting upward.\n",shift,(ii + curr_wd_bits)); while(shift >= (ii + curr_wd_bits) ) { if(++j > 63) { j -= 64; w64 = j>>6; mod64 = j&63; // Go to next-higher word of BIGWORD_BITMAP @@ -3156,11 +3156,11 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const first_entry = FALSE; psave = p; nsave = n; bits_per_word = (double)p/n; words_per_bit = 1.0/bits_per_word; - ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!"); - ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); - ASSERT(HERE,TRANSFORM_TYPE,"TRANSFORM_TYPE not set!"); - ASSERT(HERE,TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!"); - ASSERT(HERE,p > shift,"Specified shift count out of range!"); + ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!"); + ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); + ASSERT(TRANSFORM_TYPE,"TRANSFORM_TYPE not set!"); + ASSERT(TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!"); + ASSERT(p > shift,"Specified shift count out of range!"); nbits = 0; /* Total bits accumulated so far in the residue words processed */ @@ -3186,7 +3186,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const if(retval < 0) { // retval has not yet been set curr_wd_bits = shift - (nbits - bits[ii]); retval = ((uint64)j<<8) + curr_wd_bits; cy = cy_in; - // printf("Hit target bit %llu in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1); ASSERT(HERE, curr_wd_bits <= bits[ii]-1,"GAH!"); + // printf("Hit target bit %" PRIu64 " in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1); ASSERT(curr_wd_bits <= bits[ii]-1,"GAH!"); } #ifdef USE_AVX512 j1 = (j & mask03) + br16[j&15]; @@ -3214,7 +3214,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const } else { - ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE, 
"Invalid or uninited TRANSFORM_TYPE!"); + ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE, "Invalid or uninited TRANSFORM_TYPE!"); curr_wd64 = -1; curr_bit64 = 0; for(i = 0; i < 2; i++) // Two stride-2 loops to cover even and odd-indexed array elements, respectively: { @@ -3236,7 +3236,7 @@ uint64 shift_word(double a[], int n, const uint64 p, const uint64 shift, const if(retval < 0) { // retval has not yet been set curr_wd_bits = shift - (nbits - bits[ii]); retval = ((uint64)j<<8) + curr_wd_bits; cy = cy_in; - // printf("Hit target bit %llu in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1); ASSERT(HERE, curr_wd_bits <= bits[ii]-1,"GAH!"); + // printf("Hit target bit %" PRIu64 " in a[%u] (=> BIGWORD_BITMAP[%u]), bit %u of <0:%u>, bitmap-word bit = %u\n",shift,j,curr_wd64,curr_wd_bits,bits[ii]-1,curr_bit64-1); ASSERT(curr_wd_bits <= bits[ii]-1,"GAH!"); } #ifdef USE_AVX512 j1 = (j & mask03) + br16[j&15]; @@ -3300,35 +3300,35 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[] uint32 kblocks = n>>10, npad = n + ( (n >> DAT_BITS) << PAD_BITS ); // npad = length of padded data array uint64 itmp64, Res35m1, Res36m1; // Res64 from original PRP passed in via pointer; these are locally-def'd cbuf[0] = '\0'; - snprintf_nowarn(cbuf,STR_MAX_LEN,"Suyama-PRP on cofactors of %s: using FFT length %uK = %u 8-byte floats.\n",PSTRING,kblocks,n);// strcat(cbuf,cstr); -// sprintf(cstr, " this gives an average %20.15f bits per digit\n",1.0*p/n); strcat(cbuf,cstr); + snprintf(cbuf,STR_MAX_LEN*2,"Suyama-PRP on cofactors of %s: using FFT length %uK = %u 8-byte floats.\n",PSTRING,kblocks,n);// strcat(cbuf,cstr); + // sprintf(cstr, " this gives an average %20.15f bits per digit\n",1.0*p/n); strcat(cbuf,cstr); mlucas_fprint(cbuf,1); // Pepin-test output = P, vs Mersenne-PRP (type 1) residue = A; thus only need an initial mod-squaring for: // the former. 
Compute Fermat-PRP residue [A] from Euler-PRP (= Pepin-test) residue via a single mod-squaring: if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, ilo == p-1, "Fermat-mod cofactor-PRP test requires p-1 mod-squarings!"); - snprintf_nowarn(cbuf,STR_MAX_LEN,"Doing one mod-%s squaring of iteration-%u residue [Res64 = %016llX] to get Fermat-PRP residue\n",PSTRING,ilo,*Res64); + ASSERT(ilo == p-1, "Fermat-mod cofactor-PRP test requires p-1 mod-squarings!"); + snprintf(cbuf,STR_MAX_LEN*2,"Doing one mod-%s squaring of iteration-%u residue [Res64 = %016" PRIX64 "] to get Fermat-PRP residue\n",PSTRING,ilo,*Res64); mlucas_fprint(cbuf,1); ilo = 0; ihi = ilo+1; // Have checked that savefile residue is for a complete PRP test, so reset iteration counter BASE_MULTIPLIER_BITS[0] = 0ull; /*A*/ ierr = func_mod_square(a, (int*)ci, n, ilo,ihi, 0ull, p, scrnFlag, tdiff, TRUE, 0x0); convert_res_FP_bytewise(a, (uint8*)ci, n, p, Res64, &Res35m1, &Res36m1); // Overwrite passed-in Pepin-Res64 with Fermat-PRP one - snprintf_nowarn(cbuf,STR_MAX_LEN,"MaxErr = %10.9f\n",MME); mlucas_fprint(cbuf,1); + snprintf(cbuf,STR_MAX_LEN,"MaxErr = %10.9f\n",MME); mlucas_fprint(cbuf,1); } else if (MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { // Mersenne PRP-CF doesn't have the Res35m1 or Res36m1 values passed in, res_SH(ci,n,&itmp64,&Res35m1,&Res36m1); // so we refresh these; see https://github.com/primesearch/Mlucas/issues/27 } if(ierr) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"Error of type[%u] = %s in mod-squaring ... aborting\n",ierr,returnMlucasErrCode(ierr)); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"Error of type[%u] = %s in mod-squaring ... 
aborting\n",ierr,returnMlucasErrCode(ierr)); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } - sprintf(cbuf, "Fermat-PRP residue (A) = 0x%016llX,%11llu,%11llu\n",*Res64,Res35m1,Res36m1); + sprintf(cbuf, "Fermat-PRP residue (A) = 0x%016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 "\n",*Res64,Res35m1,Res36m1); mlucas_fprint(cbuf,1); j = (p+63)>>6; // j = uint64 vector length; Omit leading '1' bit in Fermat case since PRP-residue only has that set if a Fermat prime mi64_set_eq(ai,ci,j); // Copy packed-bit result back into low ceiling(p/64) bytes of A-vec (treated as a uint64 array) // Compute "prime-factor product residue" [B] from Euler-PRP (= Pepin-test) residue ... first init bitwise mul-by-base array = F, i.e. storing product of known small-prime factors: if(!nfac) { sprintf(cbuf, "Cofactor-PRP test requires one or more known factors!"); - mlucas_fprint(cbuf,0); ASSERT(HERE, 0, cbuf); + mlucas_fprint(cbuf,0); ASSERT(0, cbuf); } BASE_MULTIPLIER_BITS[0] = 1ull; lenf = 1; // Multiply each known-factor with current partial product of factors. 
@@ -3340,8 +3340,8 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[] mi64_mul_vector(BASE_MULTIPLIER_BITS,lenf, KNOWN_FACTORS+i,k, curr_fac,&lenf); mi64_set_eq(BASE_MULTIPLIER_BITS,curr_fac,lenf); } - ASSERT(HERE, (i>>2) == nfac, "Number of known-factors mismatch!"); - ASSERT(HERE, lenf <= 20, "Product of known-factors too large to fit into curr_fac[]!"); + ASSERT((i>>2) == nfac, "Number of known-factors mismatch!"); + ASSERT(lenf <= 20, "Product of known-factors too large to fit into curr_fac[]!"); for(i = 0; i < lenf; i++) { curr_fac[i] = 0ull; } // Re-zero the elts of curr_fac[] used as tmps in above loop fbits = (lenf<<6) - mi64_leadz(BASE_MULTIPLIER_BITS, lenf); // Now that have F stored in BASE_MULTIPLIER_BITS array, do powmod to get B = base^(F-1) (mod N): @@ -3349,33 +3349,33 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[] for(i = 0; i < npad; i++) { b[i] = 0; } // Zero the elements of the floating-point array b[] /****** Note: For Fermat *cofactor* PRP check we use a PRP assignment (not Pepin-test, though we need that residue as our input), meaning that PRP_BASE = 3, not the speecial value 2 it has for residue-shift purposes in Pepin test mode: ******/ - b[0] = PRP_BASE; ASSERT(HERE, PRP_BASE < (1 << (uint32)ceil(1.0*p/n)), "PRP_BASE out of range!"); + b[0] = PRP_BASE; ASSERT(PRP_BASE < (1 << (uint32)ceil(1.0*p/n)), "PRP_BASE out of range!"); ilo = 0; ihi = fbits-1; // LR modpow; init b[0] = PRP_BASE takes cares of leftmots bit RES_SHIFT = 0ull; // Zero the residue-shift so as to not have to play games with where-to-inject-the-initial-seed mi64_brev(BASE_MULTIPLIER_BITS,ihi); // bit-reverse low [ihi] bits of BASE_MULTIPLIER_BITS: /*B*/ ierr = func_mod_square(b, (int*)ci, n, ilo,ihi, 0ull, p, scrnFlag, tdiff, TRUE, 0x0); if(ierr) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"Error of type[%u] = %s on iteration %u of mod-squaring chain ... 
aborting\n",ierr,returnMlucasErrCode(ierr),ROE_ITER); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"Error of type[%u] = %s on iteration %u of mod-squaring chain ... aborting\n",ierr,returnMlucasErrCode(ierr),ROE_ITER); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } sprintf(cbuf,"Processed %u bits in binary modpow; MaxErr = %10.9f\n",ihi,MME); convert_res_FP_bytewise(b, (uint8*)ci, n, p, &itmp64, &Res35m1, &Res36m1); // Res64 reserved for Fermat-PRP result; use itmp64 here - sprintf(cstr, "%u^(F-1) residue (B) = 0x%016llX,%11llu,%11llu\n",PRP_BASE,itmp64,Res35m1,Res36m1); + sprintf(cstr, "%u^(F-1) residue (B) = %#016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 "\n",PRP_BASE,itmp64,Res35m1,Res36m1); strcat(cbuf,cstr); mlucas_fprint(cbuf,1); - ASSERT(HERE, j = (p+63)>>6,"uint64 vector length got clobbered!"); + ASSERT(j = (p+63)>>6,"uint64 vector length got clobbered!"); mi64_set_eq(bi,ci,j); // Copy packed-bit result into low j limbs of B-vec (treated as a uint64 array) itmp64 = mi64_sub(ai,bi, ai,j); // If result < 0, need to add Modulus - for N = Fm,Mp this means +-1 in LSW, respectively. // For Fermat case, the borrow out of the high limb in the preceding vector-sub is canceled by the // leading binary '1' in F[m]; in the Mersenne case, need to explicitly add 2^(p%64) to high limb: if(itmp64) { - ASSERT(HERE, itmp64 == 1ull,"Carryout = 1 expected!"); + ASSERT(itmp64 == 1ull,"Carryout = 1 expected!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { itmp64 = mi64_sub_scalar(ai,1ull, ai,j); ai[j-1] += 1ull << (p&63); } else { itmp64 = mi64_add_scalar(ai,1ull, ai,j); - } ASSERT(HERE, itmp64 == 0ull,"Carryout = 0 expected!"); + } ASSERT(itmp64 == 0ull,"Carryout = 0 expected!"); } // B-array again free, re-use in uint64-cast form to compute C = Fm/F and (A-B) mod C: // Compute Modulus ... 
note mi64-vecs have no cache-oriented element padding: @@ -3394,23 +3394,23 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[] BASE_MULTIPLIER_BITS[lenf-1] += 1ull << (fbits-1); // Restore leftmost bit ... BASE_MULTIPLIER_BITS[ 0] += 1ull; // ... and add 1 to recover F; no chance of a carryout here // Since F << N, use Mont-mul-div for C - quotient overwrites N, no rem-vec needed, just verify that F is in fact a divisor: - ASSERT(HERE, 1 == mi64_div(bi,BASE_MULTIPLIER_BITS, j,lenf, ci,0x0), "C = N/F should have 0 remainder!"); // C in ci[] + ASSERT(1 == mi64_div(bi,BASE_MULTIPLIER_BITS, j,lenf, ci,0x0), "C = N/F should have 0 remainder!"); // C in ci[] j -= (MODULUS_TYPE == MODULUS_TYPE_FERMAT); // In Fermat case, undo the above j++ used to insert the leading bit in F[m] i = j; j = mi64_getlen(ci, j); // *** Apr 2022 bug: don't add extra limb for Fermat-case to i here since (A-B) < N *** // R = (A - B) mod C in B-array (bi[]); store Q = (A - B)/C in curr_fac[] in case want to remultiply and verify Q*C + R = (A - B): - sprintf(cbuf,"(A - B) Res64 = 0x%016llX, C Res64 = 0x%016llX\n",ai[0],ci[0]); + sprintf(cbuf,"(A - B) Res64 = %#016" PRIX64 ", C Res64 = %#016" PRIX64 "\n",ai[0],ci[0]); mlucas_fprint(cbuf,1); mi64_div_binary(ai,ci, i,j, curr_fac,(uint32 *)&k, bi); // On return, k has quotient length; curr_fac[] = quo, bi[] = rem - snprintf_nowarn(cbuf,STR_MAX_LEN,"(A - B)/C: Quotient = %s, Remainder Res64 = 0x%016llX\n",&cstr[convert_mi64_base10_char(cstr,curr_fac,k,0)],bi[0]); + snprintf(cbuf,STR_MAX_LEN*2,"(A - B)/C: Quotient = %s, Remainder Res64 = %#016" PRIX64 "\n",&cstr[convert_mi64_base10_char(cstr,curr_fac,k,0)],bi[0]); mlucas_fprint(cbuf,1); // For 1-word quotient q, double-check binary-div result by computing (q*denominator + r) and comparing vs numerator: #if 0 /*** May 2022: This overwrites ci[], which hoses the is-cofactor-a-prime-power GCD() below ***/ if(k == 1) { - ASSERT(HERE, 0 == mi64_mul_scalar_add_vec2(ci, 
curr_fac[0], bi, ci, i), "Unexpected carryout!"); - ASSERT(HERE, 1 == mi64_cmp_eq(ai,ci,i), "Q*C + R = (A - B) check fails!"); + ASSERT(0 == mi64_mul_scalar_add_vec2(ci, curr_fac[0], bi, ci, i), "Unexpected carryout!"); + ASSERT(1 == mi64_cmp_eq(ai,ci,i), "Q*C + R = (A - B) check fails!"); } #endif - snprintf_nowarn(cbuf,STR_MAX_LEN,"Suyama Cofactor-PRP test of %s",PSTRING); + snprintf(cbuf,STR_MAX_LEN*2,"Suyama Cofactor-PRP test of %s",PSTRING); // Base-2 log of cofactor = lg(Fm/F) = lg(Fm) - lg(F) ~= 2^m - lg(F). 2^m stored in p, sub lg(F) in loop below: double lg_cof = p,lg_fac,log10_2 = 0.30102999566398119521; // Use lg_fac to store log2 of each factor as we recompute it for(i = 0; KNOWN_FACTORS[i] != 0ull; i += 4) { @@ -3428,7 +3428,7 @@ uint32 Suyama_CF_PRP(uint64 p, uint64*Res64, uint32 nfac, double a[], double b[] sprintf(cbuf,"This cofactor is PROBABLE PRIME [PRP%u].\n",i); mlucas_fprint(cbuf,1); } else { res_SH(bi,j,&itmp64,&Res35m1,&Res36m1); // Res64 reserved for Fermat-PRP result; use itmp64 here - sprintf(cstr," with FFT length %u = %u K:\n\t(A - B) mod C has Res64,35m1,36m1: 0x%016llX,%11llu,%11llu.\n",n,kblocks,itmp64,Res35m1,Res36m1); + sprintf(cstr," with FFT length %u = %u K:\n\t(A - B) mod C has Res64,35m1,36m1: %#016" PRIX64 ",%11" PRIu64 ",%11" PRIu64 ".\n",n,kblocks,itmp64,Res35m1,Res36m1); strcat(cbuf,cstr); mlucas_fprint(cbuf,1); /* Compute gcd(A - B,C) [cf. Phil Moore post: https://mersenneforum.org/showpost.php?p=210599&postcount=67] "Take the GCD of the difference of these two residues (A - B) with C. 
If the GCD is equal to 1, @@ -3956,9 +3956,9 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests // v18: Enable access to argc/argv outside main(): global_argv = argv; - ASSERT(HERE, (MersVec[numTest-1].fftLength != 0) && (MersVec[numTest].fftLength == 0), "numTest != MersVec allocated size!"); - ASSERT(HERE, (MvecPRP[numTest-1].fftLength != 0) && (MvecPRP[numTest].fftLength == 0), "numTest != MvecPRP allocated size!"); - ASSERT(HERE, (FermVec[numFerm-1].fftLength != 0) && (FermVec[numFerm].fftLength == 0), "numFerm != FermVec allocated size!"); + ASSERT((MersVec[numTest-1].fftLength != 0) && (MersVec[numTest].fftLength == 0), "numTest != MersVec allocated size!"); + ASSERT((MvecPRP[numTest-1].fftLength != 0) && (MvecPRP[numTest].fftLength == 0), "numTest != MvecPRP allocated size!"); + ASSERT((FermVec[numFerm-1].fftLength != 0) && (FermVec[numFerm].fftLength == 0), "numFerm != FermVec allocated size!"); /*...check that various data types are of the assumed length and do some other basic sanity checks: @@ -3979,7 +3979,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests nargs = 1; while(argv[nargs]) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); if(nargs > argc) { // == no longer applies since e.g. 
-prp requires no numeric arg and can come last: fprintf(stderr, "*** ERROR: Unterminated command-line option or malformed argument.\n"); print_help(); @@ -3996,7 +3996,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests if(STREQ(stFlag, "-s")) { selfTest = TRUE; - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); for(;;) { if(STREQ(stFlag, "a") || STREQ(stFlag, "all")) { /* all, which really means all the non-Huge-and-larger sets */ start = 0; finish = numTeensy + numTiny + numSmall + numMedium + numLarge; @@ -4041,11 +4041,11 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-maxalloc")) // maxalloc arg is max %-of-available-mem to use { - ASSERT(HERE, nbufSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!"); - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + ASSERT(nbufSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!"); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); darg = strtod(stFlag,&cptr); // Must be > 0: - ASSERT(HERE, (darg > 0), "maxalloc (%%-of-available-mem to use) argument must be > 0 ... halting."); + ASSERT((darg > 0), "maxalloc (%%-of-available-mem to use) argument must be > 0 ... halting."); // Max-%-of-RAM-to-use currently stored in MAX_RAM_USE ... 
later will multiply by (available system RAM in MB): MAX_RAM_USE = darg; maxAllocSet = TRUE; @@ -4054,11 +4054,11 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-pm1_s2_nbuf")) // pm1_s2_nbuf arg is max %-of-available-mem to use { - ASSERT(HERE, maxAllocSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!"); - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + ASSERT(maxAllocSet == FALSE, "Only one of -maxalloc and -pm1_s2_buf flags may be used!"); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); darg = strtod(stFlag,&cptr); // Must be > 0: - ASSERT(HERE, (darg > 0), "pm1_s2_nbuf argument must be integer ... halting."); + ASSERT((darg > 0), "pm1_s2_nbuf argument must be integer ... halting."); // Max-%-of-RAM-to-use currently stored in MAX_RAM_USE ... later will convert to floating-fraction and multiply by (available system RAM in MB): PM1_S2_NBUF = darg; nbufSet = TRUE; @@ -4067,16 +4067,16 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-iters")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); i64arg = atol(stFlag); // Must be < 2^32: - ASSERT(HERE, !(i64arg>>32), "#iters argument must be < 2^32 ... halting."); + ASSERT(!(i64arg>>32), "#iters argument must be < 2^32 ... 
halting."); iters = (uint32)i64arg; } else if(STREQ(stFlag, "-fft") || STREQ(stFlag, "-fftlen")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); // v20: default is still integer-FFT-length in Kdoubles, but add support for [float]M, // where floating-point arg must be exactly representable, such that [float]*2^10 is integer: i64arg = -1ull; @@ -4087,22 +4087,22 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(cptr,"K")) i64arg = darg; else { - ASSERT(HERE, 0, "The only non-numeric suffixes allowed for the argument to -fft are K and M"); + ASSERT(0, "The only non-numeric suffixes allowed for the argument to -fft are K and M"); } } else i64arg = darg; // Must be in range [MIN_FFT_LENGTH_IN_K,MAX_FFT_LENGTH_IN_K], def'd in Mdata.h: if(i64arg < MIN_FFT_LENGTH_IN_K || i64arg > MAX_FFT_LENGTH_IN_K) { - sprintf(cbuf , "ERROR: FFT-length argument = %llu, must be in range [%u,%u]K\n",i64arg,MIN_FFT_LENGTH_IN_K,MAX_FFT_LENGTH_IN_K); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf , "ERROR: FFT-length argument = %" PRIu64 ", must be in range [%u,%u]K\n",i64arg,MIN_FFT_LENGTH_IN_K,MAX_FFT_LENGTH_IN_K); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } fftlen = (uint32)i64arg; // Note this is the REAL-vector FFT length if((i = get_fft_radices(fftlen, 0, 0x0, 0x0, 0)) != 0) { sprintf(cbuf , "ERROR: FFT length %d K not available.\n",fftlen); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } // If user has supplied a set of complex-FFT radices, their product must equal half the real-FFT length: - if(rad_prod) { ASSERT(HERE, (rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!"); } + if(rad_prod) { ASSERT((rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!"); } } /* v19.1: Enhance the -radset flag 
to take either an index into the big table in get_fft_radices(), @@ -4111,7 +4111,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests set is supported and if so, set radset to the corresponding table-index numeric value: */ else if(STREQ(stFlag, "-radset")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); // Check if it's a comma-separated actual set of complex-FFT radices: char_addr = stFlag; @@ -4119,7 +4119,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests if(!cptr) { // It's a radix-set index i64arg = atol(stFlag); // Must be < 2^32: - ASSERT(HERE, i64arg < 20, "radset-index argument must be < 2^32 ... halting."); + ASSERT(i64arg < 20, "radset-index argument must be < 2^32 ... halting."); radset = (uint32)i64arg; } else { // It's a set of complex-FFT radices numrad = 0; @@ -4127,23 +4127,23 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests // Copy substring into cbuf and null-terminate: strncpy(cbuf,char_addr,(cptr-char_addr)); cbuf[cptr-char_addr] = '\0'; // Convert current radix to long and sanity-check: - i64arg = atol(cbuf); ASSERT(HERE, !(i64arg>>12), "user-supplied radices must be < 2^12 ... halting."); + i64arg = atol(cbuf); ASSERT(!(i64arg>>12), "user-supplied radices must be < 2^12 ... halting."); rvec[numrad++] = (uint32)i64arg; char_addr = cptr+1; } // A properly formatted radix-set arg will end with ',[numeric]', with the numeric in char_addr: - i64arg = atol(char_addr); ASSERT(HERE, !(i64arg>>12), "user-supplied radices must be < 2^12 ... halting."); + i64arg = atol(char_addr); ASSERT(!(i64arg>>12), "user-supplied radices must be < 2^12 ... 
halting."); rvec[numrad++] = (uint32)i64arg; rvec[numrad] = 0; // Null-terminate the vector just for aesthetics // Compute the radix product and make sure it's < 2^30, constraint due to the (fftlen < 2^31) one: rad_prod = 1; i64arg = 1ull; for(i = 0; i < numrad; i++) { - i64arg *= rvec[i]; ASSERT(HERE, !(i64arg>>30), "Product of complex-FFT radices supplied via -radset argument must be < 2^32 ... halting."); + i64arg *= rvec[i]; ASSERT(!(i64arg>>30), "Product of complex-FFT radices supplied via -radset argument must be < 2^32 ... halting."); } rad_prod = (uint32)i64arg; // If user has supplied a real-FFT length (in Kdoubles) via -fftlen, product of the complex-FFT radices must equal half that value: if(fftlen) { - ASSERT(HERE, (rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!"); + ASSERT((rad_prod>>9) == fftlen,"Product of user-supplied set of complex-FFT radices must equal half the real-FFT length!"); } else { fftlen = rad_prod>>9; // If user supplies fftlen via cmd-line arg after -radset, that's OK, // we'll overwrite fftlen with user-supplied value and repeat the above check then @@ -4163,55 +4163,55 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests } } // The init-value of radset -1 getting overwritten with something >= 0 means success: - ASSERT(HERE, radset >= 0, "User-supplied set of complex-FFT radices not supported."); + ASSERT(radset >= 0, "User-supplied set of complex-FFT radices not supported."); } } else if(STREQ(stFlag, "-shift")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); i64arg = atol(stFlag); // Must be < 2^32, though store in a uint64 for later bignum-upgrades: - ASSERT(HERE, !(i64arg>>32), "shift argument must be < 2^32 ... halting."); + ASSERT(!(i64arg>>32), "shift argument must be < 2^32 ... 
halting."); RES_SHIFT = i64arg; } // v20: Add p-1 support: else if(STREQ(stFlag, "-b1")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); i64arg = atol(stFlag); // Must be < 2^32: - ASSERT(HERE, !(i64arg>>32), "P-1 Stage 1 bound must be < 2^32 ... halting."); + ASSERT(!(i64arg>>32), "P-1 Stage 1 bound must be < 2^32 ... halting."); B1 = (uint32)i64arg; - ASSERT(HERE, testType != TEST_TYPE_PRP, "b1-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable."); + ASSERT(testType != TEST_TYPE_PRP, "b1-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable."); testType = TEST_TYPE_PM1; } else if(STREQ(stFlag, "-b2")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); // Allow Stage 2 bounds to be > 2^32: B2 = atol(stFlag); - ASSERT(HERE, testType != TEST_TYPE_PRP, "b2-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable."); + ASSERT(testType != TEST_TYPE_PRP, "b2-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable."); testType = TEST_TYPE_PM1; } else if(STREQ(stFlag, "-b2_start")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); // Allow Stage 2 bounds to be > 2^32: B2_start = atol(stFlag); - ASSERT(HERE, testType != TEST_TYPE_PRP, "b2_start-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable."); + ASSERT(testType != TEST_TYPE_PRP, "b2_start-argument implies P-1 factoring; that and PRP-test types not simultaneously specifiable."); testType = TEST_TYPE_PM1; } else if(STREQ(stFlag, "-nthread")) { #ifndef MULTITHREAD - ASSERT(HERE,0,"Multithreading must be enabled in build to permit -nthread argument!"); + ASSERT(0,"Multithreading must be enabled in build to permit -nthread argument!"); #else - ASSERT(HERE,cpu == FALSE && core == FALSE,"Only one of -nthread, -cpu and 
-core flags permitted!"); - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + ASSERT(cpu == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!"); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); i64arg = atol(stFlag); // Must be < 2^32: - ASSERT(HERE, !(i64arg>>32), "nthread argument must be < 2^32 ... halting."); + ASSERT(!(i64arg>>32), "nthread argument must be < 2^32 ... halting."); NTHREADS = (uint32)i64arg; nthread = TRUE; // Use the same affinity-setting code here as for the -cpu option, but simply for cores [0:NTHREADS-1]: @@ -4223,10 +4223,10 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-cpu")) { #ifndef MULTITHREAD - ASSERT(HERE,0,"Multithreading must be enabled in build to permit -cpu argument!"); + ASSERT(0,"Multithreading must be enabled in build to permit -cpu argument!"); #else - ASSERT(HERE,nthread == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!"); - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + ASSERT(nthread == FALSE && core == FALSE,"Only one of -nthread, -cpu and -core flags permitted!"); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); parseAffinityString(stFlag); cpu = TRUE; #endif @@ -4236,10 +4236,10 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-core")) { #ifndef MULTITHREAD - ASSERT(HERE,0,"Multithreading must be enabled in build to permit -core argument!"); + ASSERT(0,"Multithreading must be enabled in build to permit -core argument!"); #else - ASSERT(HERE,cpu == FALSE && nthread == FALSE,"Only one of -nthread, -cpu and -core flags permitted!"); - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + ASSERT(cpu == FALSE && nthread == FALSE,"Only one of -nthread, -cpu and -core flags permitted!"); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); NTHREADS = parseAffinityTriplet(stFlag,TRUE); // 2nd-arg = TRUE: Use hwloc-generated topology, via '-core 
lo:hi[:threads_per_core]' if(NTHREADS > MAX_THREADS) { fprintf(stderr,"ERROR: NTHREADS [ = %d] must not exceed those of available logical cores = 0-%d!\n",NTHREADS,MAX_THREADS-1); @@ -4256,7 +4256,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-m") || STREQ(stFlag, "-mersenne")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); expo = atol(stFlag); userSetExponent = 1; // Use 0-pad slot in MvecPtr[] to store user-set-exponent data - that can point to either MersVec @@ -4270,19 +4270,19 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-prp")) // This flag optionally takes a numeric base arg, and trips us into PRP-test mode { if(nargs < argc) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); if(isdigit(stFlag[0])) { PRP_BASE = atol(stFlag); if(PRP_BASE+1 == 0) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "*** ERROR: Numeric arg to -prp flag, '%s', overflows uint32 field.\n", stFlag); - ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "*** ERROR: Numeric arg to -prp flag, '%s', overflows uint32 field.\n", stFlag); + ASSERT(0,cbuf); } } else --nargs; } // Use 0-pad slot in MvecPRP[] to store user-set-exponent data: - ASSERT(HERE,MvecPtr == MersVec,"-prp flag invoked, but MvecPtr does not reflect the default MersVec init-value!"); + ASSERT(MvecPtr == MersVec,"-prp flag invoked, but MvecPtr does not reflect the default MersVec init-value!"); MvecPtr = MvecPRP; if(MersVec[numTest].exponent) { MvecPtr[numTest].exponent = MersVec[numTest].exponent; @@ -4296,17 +4296,17 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(STREQ(stFlag, "-base")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); i64arg = atol(stFlag); PRP_BASE = (uint32)i64arg; } else if(STREQ(stFlag, "-f") || 
STREQ(stFlag, "-fermat")) { - strncpy(stFlag, argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, argv[nargs++], STR_MAX_LEN-1); i64arg = atol(stFlag); // Must be < 2^32: - ASSERT(HERE, !(i64arg>>32), "Fermat-number-index argument must be < 2^32 ... halting."); + ASSERT(!(i64arg>>32), "Fermat-number-index argument must be < 2^32 ... halting."); findex = (uint32)i64arg; /* Make sure the Fermat number index is in range: */ if(findex < 13 || findex > 63) { @@ -4328,7 +4328,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests // Nov 2020: Sanity-check any p-1 bounds: if(testType == TEST_TYPE_PM1) { - ASSERT(HERE, (modType == MODULUS_TYPE_MERSENNE || modType == MODULUS_TYPE_FERMAT) && userSetExponent, "P-1 in command-line mode requires a Mersenne or Fermat-number modulus to be specified via '-m [int]' or '-f [int]'."); + ASSERT((modType == MODULUS_TYPE_MERSENNE || modType == MODULUS_TYPE_FERMAT) && userSetExponent, "P-1 in command-line mode requires a Mersenne or Fermat-number modulus to be specified via '-m [int]' or '-f [int]'."); pm1_check_bounds(); } @@ -4355,7 +4355,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests if(iarg == 0) { sprintf(cbuf , "*** ERROR: Must specify a valid FFT length on command line before -radset argument!\n"); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Make sure it's a valid radix set index for this FFT length: */ @@ -4367,7 +4367,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else sprintf(cbuf , "ERROR: Unknown error-code value %d from get_fft_radices(), called with radix set index %d, FFT length %d K\n",i,radset, iarg); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } @@ -4379,15 +4379,15 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests if(modType == MODULUS_TYPE_MERSENNE && !selfTest) { 
if(userSetExponent) { - ASSERT(HERE, start > 0, "userSetExponent = TRUE but self-test starting-index unset!"); + ASSERT(start > 0, "userSetExponent = TRUE but self-test starting-index unset!"); sprintf(cbuf, "ERROR: Production-run-mode [-iters not invoked] does not allow command-line\nsetting of exponent - that must be read from the %s file.\n",WORKFILE); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } else if(start == -1) { start = numTest; finish = start+1; } if(radset != -1) { sprintf(cbuf, "ERROR: Production-run-mode [-iters not invoked] allows command-line setting of\nFFT length, but not the radix set - that must be read from the mlucas.cfg file.\n"); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } ERNST_MAIN: if((retVal = ernstMain(modType,testType,0,MvecPtr[start].fftLength,0,0,0,&Res64,&Res35m1,&Res36m1,scrnFlag,&runtime)) != 0) @@ -4401,7 +4401,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests k = (uint32)(retVal >> 8); if((i = get_fft_radices(k, 0, 0x0, 0x0, 0)) != 0) { sprintf(cbuf, "ERROR: FFT length %d K not available.\n",k); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /**** IF POSSIBLE, USE ONE OF THE STANDARD TEST EXPONENTS HERE, SO CAN CHECK RES64s!!! ****/ @@ -4424,7 +4424,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests } /* ...Otherwise barf. 
*/ else { - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } else { fprintf(stderr, "\n Done ...\n\n"); @@ -4437,8 +4437,8 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests if(MvecPtr[start].exponent == 0) { i = MvecPtr[start].fftLength; - ASSERT(HERE, i > 0 ,"Require i > 0 "); - ASSERT(HERE, i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K"); + ASSERT(i > 0 ,"Require i > 0 "); + ASSERT(i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K"); // If FFT length is not represented in reference-residue array, find nearest prime <= 0.99*given_N_get_maxP(FFT length): for(j = 0; j < numTest; j++) { @@ -4454,8 +4454,8 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests } } if(expo < lo || lo >= hi) { - fprintf(stderr, "ERROR: unable to find a prime in the interval %llu <= x <= %llu.\n", lo, hi); - ASSERT(HERE, 0,"0"); + fprintf(stderr, "ERROR: unable to find a prime in the interval %" PRIu64 " <= x <= %" PRIu64 ".\n", lo, hi); + ASSERT(0,"0"); } } else { /* Use the corresponding entry of MvecPtr: */ start = j; finish = start+1; @@ -4469,8 +4469,8 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests { if(FermVec[start].Fidx == 0) { i = FermVec[start].fftLength; - ASSERT(HERE, i > 0 ,"Require i > 0 "); - ASSERT(HERE, i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K"); + ASSERT(i > 0 ,"Require i > 0 "); + ASSERT(i <=MAX_FFT_LENGTH_IN_K,"Require i <=MAX_FFT_LENGTH_IN_K"); if(i > FermVec[numFerm-1].fftLength) /* Computing a new-largest entry? 
*/ FermVec[numFerm].Fidx = (i << 4); @@ -4483,7 +4483,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests } if(lo >= numFerm) { fprintf(stderr, "ERROR: unable to find FFT length %d K in the Reference Residue table.\n", i); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } } @@ -4491,7 +4491,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests else if(findex && (FermVec[numFerm].fftLength == 0)) FermVec[numFerm].fftLength = get_default_fft_length((uint64)1 << findex); } else{ - ASSERT(HERE, 0,"modType not recognized!"); + ASSERT(0,"modType not recognized!"); } TIMING_TEST_LOOP: @@ -4621,7 +4621,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests if(iters == 100 || iters == 1000 || iters == 10000) { mvec_res_t_idx = NINT( log((double)iters)/log(10.) ) - 2; /* log10(iters) - 2, use slower NINT rather than DNINT here since latter needs correct rounding mode */ - ASSERT(HERE, mvec_res_t_idx < 3,"main: mvec_res_t_idx out of range!"); + ASSERT(mvec_res_t_idx < 3,"main: mvec_res_t_idx out of range!"); // Use empty-data-slot at top of MersVec[] or MvecPRP[], respectively, for primality & prp single-case tests: if( (modType == MODULUS_TYPE_MERSENNE && MvecPtr[xNum].res_t[mvec_res_t_idx].sh0 == 0) || (modType == MODULUS_TYPE_FERMAT && FermVec[xNum].res_t[mvec_res_t_idx].sh0 == 0) ) @@ -4672,7 +4672,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests retVal = ernstMain(modType,testType,(uint64)MvecPtr[xNum].exponent,iarg,radix_set,maxFFT,iters,&Res64,&Res35m1,&Res36m1,scrnFlag,&runtime); } else - ASSERT(HERE, 0,"Unsupported modulus and/or test type!"); + ASSERT(0,"Unsupported modulus and/or test type!"); // (retVal != 0) relates to dangerously high ROEs, use maxErr to decide whether to accept radix set. 
/*** (to-do: factor in #occurrences) ***/ @@ -4774,7 +4774,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests fp = mlucas_fopen(CONFIGFILE,FILE_ACCESS_MODE); if(!fp) { sprintf(cbuf , "INFO: Unable to open %s file in %s mode ... \n", CONFIGFILE, FILE_ACCESS_MODE); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Put code version on line 1. @@ -4799,7 +4799,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests */ if(get_fft_radices(iarg, radix_best, &NRADICES, RADIX_VEC, 10) != 0) { sprintf(cbuf , "ERROR: alleged best-radix-set index %u is unsupported.\n",radix_best); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Zero-pad the radices-printing to the full length of the RADIX_VEC array so each line has same length (needed to allow update mode): @@ -4808,7 +4808,7 @@ just below the upper limit for each FFT lengh in some subrange of the self-tests /* If it's a new self-test residue being computed, add the SH residues to the .cfg file line */ if(new_data) - fprintf(fp, "\tp = %s: %d-iter Res mod 2^64, 2^35-1, 2^36-1 = %016llX, %11.0f, %11.0f",ESTRING,iters,new_res.sh0,(double)new_res.sh1,(double)new_res.sh2); + fprintf(fp, "\tp = %s: %d-iter Res mod 2^64, 2^35-1, 2^36-1 = %016" PRIX64 ", %11.0f, %11.0f",ESTRING,iters,new_res.sh0,(double)new_res.sh1,(double)new_res.sh2); fprintf(fp,"\n"); fclose(fp); fp = 0x0; @@ -4842,10 +4842,10 @@ uint64 parse_cmd_args_get_shift_value(void) int i, nargs = 1; while(global_argv[nargs]) { - strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN-1); if(STREQ(stFlag, "-shift")) { - strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN); + strncpy(stFlag, global_argv[nargs++], STR_MAX_LEN-1); /* Convert the shift argument to a uint64: */ i64arg = 0; for(i = 0; i < STR_MAX_LEN && stFlag[i] != '\0'; i++) { @@ -4854,12 
+4854,12 @@ uint64 parse_cmd_args_get_shift_value(void) /* Check for overflow: */ if(i64arg % (uint64)10 != (uint64)(stFlag[i]-CHAROFFSET)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "*** ERROR: -shift argument %s overflows uint64 field.\n", stFlag); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "*** ERROR: -shift argument %s overflows uint64 field.\n", stFlag); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "*** ERROR: Non-numeric character encountered in -shift argument %s.\n", stFlag); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "*** ERROR: Non-numeric character encountered in -shift argument %s.\n", stFlag); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } } @@ -4899,7 +4899,7 @@ int cfgNeedsUpdating(char*in_line) const char*returnMlucasErrCode(uint32 ierr) { - ASSERT(HERE, ierr < ERR_MAX, "Error code out of range!"); + ASSERT(ierr < ERR_MAX, "Error code out of range!"); return err_code[ierr-1]; } @@ -4918,7 +4918,7 @@ void printMlucasErrCode(uint32 ierr) /* High bytes should only be nonzero if low byte == ERR_RUN_SELFTEST_FORLENGTH: */ if((ierr>>8) != 0) { - ASSERT(HERE, i==ERR_RUN_SELFTEST_FORLENGTH, "High bytes should only be nonzero if low byte == ERR_RUN_SELFTEST_FORLENGTH!"); + ASSERT(i==ERR_RUN_SELFTEST_FORLENGTH, "High bytes should only be nonzero if low byte == ERR_RUN_SELFTEST_FORLENGTH!"); } } @@ -5075,7 +5075,7 @@ int read_ppm1_residue(const uint32 nbytes, FILE*fp, uint8 arr_tmp[], uint64*Res6 for(i = nbytes; i < nbytes+j; i++) arr_tmp[i] = 0; itmp64 = ((uint64*)arr_tmp)[0]; if(*Res64 != itmp64) { - sprintf(cbuf, "%s: On restart: Res64 checksum error! Got %llX, expected %llX\n" ,func,itmp64,*Res64); return 0; + sprintf(cbuf, "%s: On restart: Res64 checksum error! 
Got %" PRIX64 ", expected %" PRIX64 "\n" ,func,itmp64,*Res64); return 0; } // For big-endian CPUs, casting byte-array to uint64* gives byte-reversed limbs, so use a direct bitwise mod: #ifdef USE_BIG_ENDIAN @@ -5115,17 +5115,17 @@ int read_ppm1_residue(const uint32 nbytes, FILE*fp, uint8 arr_tmp[], uint64*Res6 MOD_ADD64(bmod35,29,35,bmod35); MOD_ADD64(bmod36,28,36,bmod36); // bmod35|36 += 29|28 (mod 35|36) } rmod35 = (rmod35 & two35m1) + (rmod35 >> 35); rmod36 = (rmod36 & two36m1) + (rmod36 >> 36); // And do a final pair of folds to get mods - if(*Res35m1 != rmod35) { sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %llX, expected %llX\n",func,rmod35,*Res35m1); return 0; } - if(*Res36m1 != rmod36) { sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %llX, expected %llX\n",func,rmod36,*Res36m1); return 0; } + if(*Res35m1 != rmod35) { sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,rmod35,*Res35m1); return 0; } + if(*Res36m1 != rmod36) { sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,rmod36,*Res36m1); return 0; } #else i = (nbytes+7)>>3; // # of 64-bit limbs itmp64 = mi64_div_by_scalar64((uint64*)arr_tmp,two35m1,i,0x0); if(*Res35m1 != itmp64) { - sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %llX, expected %llX\n",func,itmp64,*Res35m1); return 0; + sprintf(cbuf, "%s: On restart: Res35m1 checksum error! Got %" PRIX64 ", expected %" PRIX64 "\n",func,itmp64,*Res35m1); return 0; } itmp64 = mi64_div_by_scalar64((uint64*)arr_tmp,two36m1,i,0x0); if(*Res36m1 != itmp64) { - sprintf(cbuf, "%s: On restart: Res36m1 checksum error! Got %llX, expected %llX\n",func,itmp64,*Res36m1); return 0; + sprintf(cbuf, "%s: On restart: Res36m1 checksum error! 
Got %" PRIX64 ", expected %" PRIX64 "\n",func,itmp64,*Res36m1); return 0; } #endif return 1; @@ -5144,9 +5144,9 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin uint128 ui128,vi128; uint192 ui192,vi192; uint256 ui256,vi256; // Fixed-length 2/3/4-word ints for stashing results of multiword modexp. *Res64 = 0ull; // 0 value on return indicates failure of some kind mi64_clear(pow,4); mi64_clear(rem,4); - ASSERT(HERE, arr1 != 0x0, "Null arr1 pointer!"); + ASSERT(arr1 != 0x0, "Null arr1 pointer!"); if(!file_valid(fp)) { - sprintf(cbuf, "%s: File pointer invalid for read!\n",func); ASSERT(HERE, 0, cbuf); + sprintf(cbuf, "%s: File pointer invalid for read!\n",func); ASSERT(0, cbuf); } fprintf(stderr, " INFO: restart file %s found...reading...\n",fname); /* t: */ @@ -5170,9 +5170,9 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin if(TEST_TYPE == TEST_TYPE_PM1) { if(strstr(fname, ".s2")) { if(nsquares > 0xFFFFFFFFull) - ASSERT(HERE, B2_start <= nsquares, "P-1 stage 2 restart requires (B2_start in worktodo assignment) <= (savefile nsquares field)!"); + ASSERT(B2_start <= nsquares, "P-1 stage 2 restart requires (B2_start in worktodo assignment) <= (savefile nsquares field)!"); } else { // It's a stage 1 restart: - ASSERT(HERE, nsquares <= 0xFFFFFFFFull && nsquares < 1.5*(double)B1, "P-1 stage 1 restart: savefile nsquares value out of bounds!"); + ASSERT(nsquares <= 0xFFFFFFFFull && nsquares < 1.5*(double)B1, "P-1 stage 1 restart: savefile nsquares value out of bounds!"); } // If S2 restart and (nsquares > B2_start), read the ensuing S2 interim residue; if (nsquares == B2_start) // it means S2 started but was aborted for some reason before writing an interim S2 residue. That will set @@ -5180,10 +5180,10 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin // the S2 code interprets as "start stage 2 from B2_start." 
} else { // For primality-tests, make sure nsquares < 2^32 and copy to ilo: if(nsquares > p) { // v21: change from >= p to > p, since Mersenne-PRP restart-to-check-CF will have nsquares == p: - sprintf(cbuf,"%s: nsquares = %llu out of range, should be < p = %llu\n",func, nsquares, p); + sprintf(cbuf,"%s: nsquares = %" PRIu64 " out of range, should be < p = %" PRIu64 "\n",func, nsquares, p); return 0; } else if(nsquares > 0xFFFFFFFFull) { - sprintf(cbuf,"%s: nsquares = %llu out of range, current limit = 2^32-1.\n",func, nsquares); + sprintf(cbuf,"%s: nsquares = %" PRIu64 " out of range, current limit = 2^32-1.\n",func, nsquares); return 0; } } @@ -5195,7 +5195,7 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin TRANSFORM_TYPE = REAL_WRAPPER; } else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { sprintf(cbuf, "%s: MODULUS_TYPE_FERMAT but (p mod 8) != 0",func); - ASSERT(HERE, (p & 7) == 0,cbuf); + ASSERT((p & 7) == 0,cbuf); nbytes = (p>>3) + 1; TRANSFORM_TYPE = RIGHT_ANGLE; } @@ -5243,10 +5243,10 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin (Also added CRT routine to nt_utils.txt, which takes the 3 remainders mod the known prime factors and confirms the DIV result.) 
*/ if(TEST_TYPE == TEST_TYPE_PRP) { - len = (nbytes+7)>>3; j = p&63; itmp64 = avec[len-1]; ASSERT(HERE, (itmp64 >> j) == 0ull, "High limb of residue array1 does not have upper bits cleared!"); + len = (nbytes+7)>>3; j = p&63; itmp64 = avec[len-1]; ASSERT((itmp64 >> j) == 0ull, "High limb of residue array1 does not have upper bits cleared!"); for(i = 0; KNOWN_FACTORS[i] != 0ull; i += 4) { j = mi64_getlen(KNOWN_FACTORS+i,4); // j = number of nonzero limbs in curr_fac (alloc 4 limbs per in KNOWN_FACTORS[]) - sprintf(cstr,"Computing %llu-squaring residue R (mod known prime q = %s)\n",nsquares,&cbuf[convert_mi64_base10_char(cbuf, KNOWN_FACTORS+i, j, 0)] ); mlucas_fprint(cstr,1); + sprintf(cstr,"Computing %" PRIu64 "-squaring residue R (mod known prime q = %s)\n",nsquares,&cbuf[convert_mi64_base10_char(cbuf, KNOWN_FACTORS+i, j, 0)] ); mlucas_fprint(cstr,1); mi64_div(avec,KNOWN_FACTORS+i, len,j, 0x0,rem); // R (mod p) returned in rem[] k = mi64_getlen(rem,4); // j = number of nonzero limbs in remainder sprintf(cstr,"\tA: R == %s (mod q)\n",&cbuf[convert_mi64_base10_char(cbuf, rem, k, 0)] ); mlucas_fprint(cstr,1); @@ -5268,13 +5268,13 @@ int read_ppm1_savefiles(const char*fname, uint64 p, uint32*kblocks, FILE*fp, uin ui256 = twopmmodq256(vi256,ui256); // pow' = 2^nsquares (mod p-1) exp[0] = ui256.d0; exp[1] = ui256.d1; exp[2] = ui256.d2; exp[3] = ui256.d3; } else - ASSERT(HERE, 0, "Only known-factors < 2^256 supported!"); + ASSERT(0, "Only known-factors < 2^256 supported!"); // Raise PRP base (usually but not always 3) to the just-computed power; result in 4-limb local-array pow[]: mi64_scalar_modpow_lr(PRP_BASE, exp, KNOWN_FACTORS+i, j, pow); sprintf(cstr,"\tB: R == %s (mod q)\n",&cbuf[convert_mi64_base10_char(cbuf, pow, j, 0)] ); mlucas_fprint(cstr,1); if (mi64_getlen(pow,4) != k || !mi64_cmp_eq(pow,rem,k)) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"Full-residue == %u^nsquares (mod q) check fails!", PRP_BASE); mlucas_fprint(cbuf,0); - ASSERT(HERE, 0, cbuf); + 
snprintf(cbuf,STR_MAX_LEN,"Full-residue == %u^nsquares (mod q) check fails!", PRP_BASE); mlucas_fprint(cbuf,0); + ASSERT(0, cbuf); } } } @@ -5311,7 +5311,7 @@ Thus if we use a negative-power algo, to recover 2^p (mod q = 2^k.qodd): // v19: For PRP-tests, also read a second Gerbicz-check residue array [arr2] and associated S-H checksum triplet [i1,i2,i3]: if(DO_GCHECK) { // v21: Change to key off DO_GCHECK, to allow Fermat-mod Pepin-tests to use the Gerbicz check, too - ASSERT(HERE, arr2 != 0x0, "Null arr2 pointer!"); + ASSERT(arr2 != 0x0, "Null arr2 pointer!"); PRP_BASE = 0ull; for(j = 0; j < 4; j++) { i = fgetc(fp); PRP_BASE += i << (8*j); @@ -5367,8 +5367,8 @@ void write_ppm1_residue(const uint32 nbytes, FILE*fp, const uint8 arr_tmp[], con i = fwrite(arr_tmp, sizeof(char), nbytes, fp); if(i != nbytes) { fclose(fp); fp = 0x0; - snprintf_nowarn(cbuf,STR_MAX_LEN,"%s: Error writing residue to restart file.\n",func); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"%s: Error writing residue to restart file.\n",func); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } /* ...and checksums: */ /* Res64: */ @@ -5388,10 +5388,10 @@ void write_ppm1_savefiles(const char*fname, uint64 p, int n, FILE*fp, uint64 ihi uint8 arr2[], uint64 i1 , uint64 i2 , uint64 i3 ) { uint32 i,kblocks,nbytes = 0; - ASSERT(HERE,file_valid(fp),"write_ppm1_savefiles: File pointer invalid for write!"); + ASSERT(file_valid(fp),"write_ppm1_savefiles: File pointer invalid for write!"); // Make sure n is a proper (unpadded) FFT-length, i.e. 
is a multiple of 1K: kblocks = (n >> 10); - ASSERT(HERE,n == (kblocks << 10),"Not a proper unpadded FFT length"); + ASSERT(n == (kblocks << 10),"Not a proper unpadded FFT length"); /* See the function read_ppm1_savefiles() for the file format here: */ /* t: */ @@ -5407,7 +5407,7 @@ void write_ppm1_savefiles(const char*fname, uint64 p, int n, FILE*fp, uint64 ihi nbytes = (p + 7)/8; TRANSFORM_TYPE = REAL_WRAPPER; } else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, p % 8 == 0,"write_ppm1_savefiles: p % 8 == 0"); + ASSERT(p % 8 == 0,"write_ppm1_savefiles: p % 8 == 0"); nbytes = (p/8) + 1; // We don't expect > p bits except in the highly unlikely case of a prime-Fermat Pepin-test result TRANSFORM_TYPE = RIGHT_ANGLE; } @@ -5460,27 +5460,27 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const uint64 curr_word, curr_wd64; int pow2_fft; - ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!"); - ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); + ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!"); + ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); - ASSERT(HERE,TRANSFORM_TYPE,"TRANSFORM_TYPE not set!"); - ASSERT(HERE,TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!"); + ASSERT(TRANSFORM_TYPE,"TRANSFORM_TYPE not set!"); + ASSERT(TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!"); /* Set the number of residue bytes, which is the same for Mersenne (2^p-1) and Fermat-mod (2^p+1, with p = 2^findex) despite the fact the latter can formally be as large as 2^p, since only ever hit that if it`s the last residue of a Pepin test and the number hqppens to be prime. 
(We would love for that exception to break some other ASSERTion in the code): */ if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { - ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_bytewise_FP: TRANSFORM_TYPE == REAL_WRAPPER"); + ASSERT(TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_bytewise_FP: TRANSFORM_TYPE == REAL_WRAPPER"); } else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_bytewise_FP: TRANSFORM_TYPE == RIGHT_ANGLE"); + ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_bytewise_FP: TRANSFORM_TYPE == RIGHT_ANGLE"); /* If Fermat number, make sure exponent a power of 2: */ findex = trailz64(p); - ASSERT(HERE, (p >> findex) == 1,"convert_res_bytewise_FP: (p >> findex) == 1"); + ASSERT((p >> findex) == 1,"convert_res_bytewise_FP: (p >> findex) == 1"); - ASSERT(HERE, p % 8 == 0,"convert_res_bytewise_FP: p % 8 == 0"); + ASSERT(p % 8 == 0,"convert_res_bytewise_FP: p % 8 == 0"); } nbytes = (p + 7)/8; // Apply the circular shift: @@ -5490,7 +5490,7 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const /* Vector length a power of 2? */ pow2_fft = (n >> trailz32(n)) == 1; - bits[0] = p/n; ASSERT(HERE, bits[0] > 1,"convert_res_bytewise_FP: bits[0] > 1"); + bits[0] = p/n; ASSERT(bits[0] > 1,"convert_res_bytewise_FP: bits[0] > 1"); base[0] = 1 << bits[0]; if(MODULUS_TYPE == MODULUS_TYPE_FERMAT && pow2_fft == TRUE) @@ -5542,7 +5542,7 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const if(rbits < bits[ii]) { itmp = curr_wd64; - ASSERT(HERE, itmp < (1ull<= 2^rbits!"); + ASSERT(itmp < (1ull<= 2^rbits!"); /* Now grab the next 64 bits of the bytewise residue... */ curr_wd64 = 0; @@ -5619,7 +5619,7 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const if(rbits < bits[ii]) { itmp = curr_wd64; - ASSERT(HERE, itmp < (1<= 2^rbits!"); + ASSERT(itmp < (1<= 2^rbits!"); /* Now grab the next 64 bits of the bytewise residue... 
*/ curr_wd64 = 0; @@ -5662,9 +5662,9 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const } } - ASSERT(HERE, curr_char == nbytes, "convert_res_bytewise_FP: curr_char == (p+7)/8"); - ASSERT(HERE, nbits == p ,"convert_res_bytewise_FP: nbits == p "); - ASSERT(HERE, curr_wd64 == 0,"convert_res_bytewise_FP: curr_word == 0"); + ASSERT(curr_char == nbytes, "convert_res_bytewise_FP: curr_char == (p+7)/8"); + ASSERT(nbits == p ,"convert_res_bytewise_FP: nbits == p "); + ASSERT(curr_wd64 == 0,"convert_res_bytewise_FP: curr_word == 0"); /* Fold any carryout from the conversion to balanced-representation form @@ -5674,8 +5674,8 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const /* Should have carryout of +1 Iff MS word < 0; otherwise expect 0 carry: */ if(cy && (a[j1] >= 0 || cy != +1)) { - sprintf(cbuf, "convert_res_bytewise_FP: Illegal combination of nonzero carry = %lld, most sig. word = %20.4f\n", cy, a[j]); - ASSERT(HERE, 0, cbuf); + sprintf(cbuf, "convert_res_bytewise_FP: Illegal combination of nonzero carry = %" PRId64 ", most sig. 
word = %20.4f\n", cy, a[j]); + ASSERT(0, cbuf); } if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -5683,7 +5683,7 @@ int convert_res_bytewise_FP(const uint8 ui64_arr_in[], double a[], int n, const else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) a[0] -= cy; else - ASSERT(HERE, 0,"Illegal modulus type!"); + ASSERT(0,"Illegal modulus type!"); return TRUE; } @@ -5712,22 +5712,22 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons const uint64 two35m1 = (uint64)0x00000007FFFFFFFFull, two36m1 = (uint64)0x0000000FFFFFFFFFull; /* 2^35,36-1 */ uint64*u64_ptr = (uint64*)ui64_arr_out; - ASSERT(HERE,MODULUS_TYPE,"MODULUS_TYPE not set!"); - ASSERT(HERE,MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); - ASSERT(HERE,TRANSFORM_TYPE,"TRANSFORM_TYPE not set!"); - ASSERT(HERE,TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!"); + ASSERT(MODULUS_TYPE,"MODULUS_TYPE not set!"); + ASSERT(MODULUS_TYPE <= MODULUS_TYPE_MAX,"MODULUS_TYPE out of range!"); + ASSERT(TRANSFORM_TYPE,"TRANSFORM_TYPE not set!"); + ASSERT(TRANSFORM_TYPE <= TRANSFORM_TYPE_MAX,"TRANSFORM_TYPE out of range!"); /* If Fermat number, make sure exponent a power of 2: */ if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_FP_bytewise: TRANSFORM_TYPE == RIGHT_ANGLE"); + ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE,"convert_res_FP_bytewise: TRANSFORM_TYPE == RIGHT_ANGLE"); findex = trailz64(p); - ASSERT(HERE, (p >> findex) == 1,"convert_res_FP_bytewise: (p >> findex) == 1"); + ASSERT((p >> findex) == 1,"convert_res_FP_bytewise: (p >> findex) == 1"); } else if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) - ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_FP_bytewise: TRANSFORM_TYPE == REAL_WRAPPER"); + ASSERT(TRANSFORM_TYPE == REAL_WRAPPER,"convert_res_FP_bytewise: TRANSFORM_TYPE == REAL_WRAPPER"); else - ASSERT(HERE, 0,"Illegal modulus type!"); + ASSERT(0,"Illegal modulus type!"); /* Vector length a power of 2? 
*/ pow2_fft = (n >> trailz32(n)) == 1; @@ -5839,7 +5839,7 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons atmp = a[j1]; if(atmp != NINT(atmp)) { sprintf(cbuf,"%s: Input float-residue elements must have 0 fractional part! A[%u (of %u)] = %20.10f",func,j,n,atmp); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } itmp = (int64)(atmp+ cy); /* current digit in int64 form, subtracting any borrow from the previous digit. */ if(itmp < 0) { /* If current digit < 0, add the current base and set carry into next-higher digit = -1 */ @@ -5848,11 +5848,11 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons } else { cy = 0; } - ASSERT(HERE, itmp >= 0,"convert_res_FP_bytewise: itmp >= 0"); + ASSERT(itmp >= 0,"convert_res_FP_bytewise: itmp >= 0"); /* Update 8-byte residue buffer last, since this one modifies itmp: */ - ASSERT(HERE, rbits < 8,"convert_res_FP_bytewise: rbits < 8"); - ASSERT(HERE, curr_wd64 < (1<= 2^rbits!"); + ASSERT(rbits < 8,"convert_res_FP_bytewise: rbits < 8"); + ASSERT(curr_wd64 < (1<= 2^rbits!"); itmp = (itmp << rbits) + curr_wd64; curr_bits = bits[ii] + rbits; @@ -5899,7 +5899,7 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons atmp = a[j1]; if(atmp != NINT(atmp)) { sprintf(cbuf,"%s: Input float-residue elements must have 0 fractional part! A[%u (of %u) = %20.10f] = ",func,j,n,atmp); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } itmp = (int64)(atmp+ cy); /* current digit in int64 form, subtracting any borrow from the previous digit. 
*/ if(itmp < 0) { /* If current digit < 0, add the current base and set carry into next-higher digit = -1 */ @@ -5908,11 +5908,11 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons } else { cy = 0; } - ASSERT(HERE, itmp >= 0,"convert_res_FP_bytewise: itmp >= 0"); + ASSERT(itmp >= 0,"convert_res_FP_bytewise: itmp >= 0"); /* Update 8-byte residue buffer last, since this one modifies itmp: */ - ASSERT(HERE, rbits < 8,"convert_res_FP_bytewise: rbits < 8"); - ASSERT(HERE, curr_wd64 < (1<= 2^rbits!"); + ASSERT(rbits < 8,"convert_res_FP_bytewise: rbits < 8"); + ASSERT(curr_wd64 < (1<= 2^rbits!"); itmp = (itmp << rbits) + curr_wd64; curr_bits = bits[ii] + rbits; @@ -5932,18 +5932,18 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons if(cy && (!msw_lt0 || cy != -1)) { sprintf(cbuf, "convert_res_FP_bytewise: Illegal combination of nonzero carry = %d, msw_lt0 = %d\n", cy, msw_lt0); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } /* Residue should contain ceiling(p/8) bytes: */ - ASSERT(HERE, rbits < 8, "rbits >= 8"); + ASSERT(rbits < 8, "rbits >= 8"); if(rbits) { - ASSERT(HERE, curr_wd64 < (1<= 2^rbits!"); + ASSERT(curr_wd64 < (1<= 2^rbits!"); ui64_arr_out[curr_char++] = curr_wd64 & 255; curr_wd64 >>= 8; } - ASSERT(HERE, curr_char == (p+7)/8,"convert_res_FP_bytewise: curr_char == (p+7)/8"); - ASSERT(HERE, nbits == p ,"convert_res_FP_bytewise: nbits == p "); - ASSERT(HERE, curr_wd64 == 0 ,"convert_res_FP_bytewise: curr_wd64 == 0 "); + ASSERT(curr_char == (p+7)/8,"convert_res_FP_bytewise: curr_char == (p+7)/8"); + ASSERT(nbits == p ,"convert_res_FP_bytewise: nbits == p "); + ASSERT(curr_wd64 == 0 ,"convert_res_FP_bytewise: curr_wd64 == 0 "); // Remove the circular shift ... have no mi64_shrc function, so use that b-bit rightward cshift equivalent to (p-b)-bit left-cshift. 
// (But must guard against RES_SHIFT = 0, since in that case the left-shift count == p and mi64_shlc requires shift count strictly < p): @@ -5953,7 +5953,7 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons ***/ j = (p+63)>>6; // # of 64-bit limbs if(RES_SHIFT) { - // fprintf(stderr,"convert_res_FP_bytewise: removing shift = %llu\n",RES_SHIFT); + // fprintf(stderr,"convert_res_FP_bytewise: removing shift = %" PRIu64 "\n",RES_SHIFT); uint32 sign_flip = (MODULUS_TYPE == MODULUS_TYPE_FERMAT); mi64_shlc(u64_ptr, u64_ptr, p, p-RES_SHIFT,j,sign_flip); // If current residue R needed a sign-flip - again, this can only happen in the Fermat-mod case - @@ -5968,7 +5968,7 @@ void convert_res_FP_bytewise(const double a[], uint8 ui64_arr_out[], int n, cons if(Res64 ) *Res64 = ((uint64*)ui64_arr_out)[0]; if(Res35m1) *Res35m1 = mi64_div_by_scalar64((uint64*)ui64_arr_out,two35m1,j,0x0); if(Res36m1) *Res36m1 = mi64_div_by_scalar64((uint64*)ui64_arr_out,two36m1,j,0x0); -// fprintf(stderr,"Res35m1,Res36m1: %llu,%llu\n",*Res35m1,*Res36m1); +// fprintf(stderr,"Res35m1,Res36m1: %" PRIu64 ",%" PRIu64 "\n",*Res35m1,*Res36m1); } /*********************/ @@ -6029,7 +6029,7 @@ uint32 get_default_factoring_depth(uint64 p) int is_hex_string(char*s, int len) { int i; - ASSERT(HERE, s != 0x0, "Null ptr to is_hex_string()"); + ASSERT(s != 0x0, "Null ptr to is_hex_string()"); for(i = 0; i < len; ++i) { if( !isxdigit(s[i]) ) @@ -6101,7 +6101,7 @@ char*check_kbnc(char*in_str, uint64*p) { if((char_addr = strstr(cptr, ",")) == 0x0) { fprintf(stderr,"%s: Expected ',' not found in assignment-specifying line!\n",func); break; } - *p = strtoull(char_addr+1, &cptr, 10); ASSERT(HERE, *p != -1ull, "strtoull() overflow detected."); + *p = strtoull(char_addr+1, &cptr, 10); ASSERT(*p != -1ull, "strtoull() overflow detected."); if(*p > PMAX) { fprintf(stderr,"%s: Exponent n in modulus expression m = k*b^n+c exceeds limit! 
(Suggest checking for unsigned overflow.)\n",func); break; } @@ -6161,7 +6161,7 @@ void generate_JSON_report( const char*pm1_status[2] = {"NF","F"}; const char*false_or_true[2] = {"false","true"}; // Attempt to read 32-hex-char Primenet assignment ID for current assignment (first line of WORKFILE): - ASSERT(HERE,(fp = mlucas_fopen(WORKFILE, "r")) != 0x0,"Workfile not found!"); + ASSERT((fp = mlucas_fopen(WORKFILE, "r")) != 0x0,"Workfile not found!"); // v20.1.1: Parse first line whose leading non-WS char is alphabetic: char_addr = 0x0; while(fgets(in_line, STR_MAX_LEN, fp) != 0x0) { @@ -6169,10 +6169,10 @@ void generate_JSON_report( if(isalpha(*char_addr)) break; } fclose(fp); fp = 0x0; - ASSERT(HERE,strlen(char_addr) != 0 && isalpha(*char_addr),"Eligible assignment (leading non-WS char alphabetic) not found in workfile!"); + ASSERT(strlen(char_addr) != 0 && isalpha(*char_addr),"Eligible assignment (leading non-WS char alphabetic) not found in workfile!"); if(!strstr(in_line, ESTRING) && !(MODULUS_TYPE == MODULUS_TYPE_FERMAT && strstr(in_line, BIN_EXP)) ) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: Current exponent %s not found in %s file!\n",ESTRING,WORKFILE); - ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: Current exponent %s not found in %s file!\n",ESTRING,WORKFILE); + ASSERT(0,cbuf); } // Is there a Primenet-server 32-hexit assignment ID in the assignment line? 
If so, include it in the JSON output: char_addr = strstr(in_line, "="); @@ -6186,9 +6186,9 @@ void generate_JSON_report( if(TEST_TYPE == TEST_TYPE_PRIMALITY) { snprintf(ttype,10,"LL"); if(*aid) { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer,aid); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer,aid); } else { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,n,RES_SHIFT,VERSION,timebuffer); } } else if(TEST_TYPE == TEST_TYPE_PRP && KNOWN_FACTORS[0]) { // PRP-CF result // Print list of known factors used for CF test. 
Unlike the Primenet assignment formtting on the input side, @@ -6207,42 +6207,42 @@ void generate_JSON_report( strcat( cbuf, "]"); snprintf(ttype,10,"PRP-%u",PRP_BASE); if(*aid) { - snprintf_nowarn(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid); } else { - snprintf_nowarn(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"known-factors\":%s, \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":5, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,cbuf,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer); } } else if(TEST_TYPE == TEST_TYPE_PRP) { // Only support type-1 PRP tests, so hardcode that 
subfield: snprintf(ttype,10,"PRP-%u",PRP_BASE); if(*aid) { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer,aid); } else { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%llu, \"worktype\":\"%s\", \"res64\":\"%016llX\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%llu, \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%c\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"res64\":\"%016" PRIX64 "\", \"residue-type\":1, \"res2048\":\"%s\", \"fft-length\":%u, \"shift-count\":%" PRIu64 ", \"error-code\":\"00000000\", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",prp_status[isprime],p,ttype,Res64,Res2048,n,RES_SHIFT,VERSION,timebuffer); } } else if(TEST_TYPE == TEST_TYPE_PM1) { // For p-1 assume there was an AID in the assignment, even if an all-0s one: snprintf(ttype,10,"PM1"); if(!strlen(factor)) { // No factor was found: if(*aid) { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, 
\"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer,aid); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer,aid); } else { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[0],p,ttype,n,B1,B2,VERSION,timebuffer); } } else { // The factor in the eponymous arglist field was found: if(B2 <= B1) { // No stage 2 was run if(*aid) { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer,aid); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer,aid); } else { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer); + 
snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,factor,VERSION,timebuffer); } } else { // Include B2 and flag indicating whether the s2 interval was completely covered or not. Factor must be in "" due to possibility of > 64-bit, which overflows a JSON int: if(*aid) { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer,aid); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\", \"aid\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer,aid); } else { - snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%llu, \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%llu, \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer); + snprintf(cstr,STR_MAX_LEN,"{\"status\":\"%s\", \"exponent\":%" PRIu64 ", \"worktype\":\"%s\", \"fft-length\":%u, \"B1\":%u, \"B2\":%" PRIu64 ", \"partial-stage-2\":%s, \"factors\":[\"%s\"], \"program\":{\"name\":\"Mlucas\", \"version\":\"%s\"}, \"timestamp\":\"%s\"}\n",pm1_status[1],p,ttype,n,B1,B2,false_or_true[s2_partial],factor,VERSION,timebuffer); } } } } else - ASSERT(HERE, 0, "Unsupported test type!"); + ASSERT(0, 
"Unsupported test type!"); } /*********************/ @@ -6307,7 +6307,7 @@ void dif1_dit1_func_name( case 4032: *func_dif_pass1 = radix4032_dif_pass1; *func_dit_pass1 = radix4032_dit_pass1; break; // case 4096: *func_dif_pass1 = radix4096_dif_pass1; *func_dit_pass1 = radix4096_dit_pass1; break; default: - sprintf(cbuf,"ERROR: radix %d not available for [dif,dit] pass1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for [dif,dit] pass1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } @@ -6325,14 +6325,14 @@ uint32 extract_known_factors(uint64 p, char*fac_start) { uint64 *fac = 0x0, twop[4], quo[4],rem[4]; // fac = ptr to each mi64-converted factor input string; uint256 p256,q256,res256; char*cptr = fac_start+1; - ASSERT(HERE, fac_start[0] == '\"',"Known-factors line of worktodo must consist of a comma-separated list of such enclosed in double-quotes!"); + ASSERT(fac_start[0] == '\"',"Known-factors line of worktodo must consist of a comma-separated list of such enclosed in double-quotes!"); /* If it's a Fermat number, need to check size of 2^ESTRING: */ if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { findex = (uint32)p; if(findex <= MAX_PRIMALITY_TEST_BITS) p = (uint64)1 << findex; else - ASSERT(HERE, 0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS"); + ASSERT(0,"nbits_in_p <= MAX_PRIMALITY_TEST_BITS"); } // Factors separated by commas (first clause of while()); list terminated with " (2nd clause): while((char_addr = strstr(cptr,",")) != 0x0 || (char_addr = strstr(cptr,"\"")) != 0x0) { @@ -6340,34 +6340,34 @@ uint32 extract_known_factors(uint64 p, char*fac_start) { strncpy(cbuf,cptr,nchar); cbuf[nchar] = '\0'; // Extract current-factor-as-string into cbuf // Convert stringified factor f to mi64 form: lenf = 0; fac = convert_base10_char_mi64(cbuf, &lenf); // This does the mem-alloc for us - ASSERT(HERE, lenf > 0, "Error converting known-factor string!"); - ASSERT(HERE, lenf < 5, 
"known-factor out of range, must be < 2^256!"); + ASSERT(lenf > 0, "Error converting known-factor string!"); + ASSERT(lenf < 5, "known-factor out of range, must be < 2^256!"); fbits = (lenf<<6) - mi64_leadz(fac, lenf); // Make sure the alleged factor is of the proper form: // For Mersenne M(p), q = 2.k.p + 1, with p prime; For Fermat F_n = 2^2^n+1, q = k.2^(n+2) + 1 // and we store the binary exponent 2^n in p, and 2^(n+2) in twop (yes, a misnomer in this case): if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { - mi64_set_eq_scalar(twop,p<<1,lenf); ASSERT(HERE, p < 0x8000000000000000ull, "Mersenne exponent limited to 63 bits!"); + mi64_set_eq_scalar(twop,p<<1,lenf); ASSERT(p < 0x8000000000000000ull, "Mersenne exponent limited to 63 bits!"); } else { - mi64_set_eq_scalar(twop,p<<2,lenf); ASSERT(HERE, p < 0x4000000000000000ull, "Fermat-number index must be < 62!"); + mi64_set_eq_scalar(twop,p<<2,lenf); ASSERT(p < 0x4000000000000000ull, "Fermat-number index must be < 62!"); } mi64_div(fac,twop, lenf,lenf, quo,rem); - i = mi64_cmp_eq_scalar(rem,1ull,lenf); ASSERT(HERE, i,"Factor not of required form!"); + i = mi64_cmp_eq_scalar(rem,1ull,lenf); ASSERT(i,"Factor not of required form!"); // Alloc 4 limbs per factor in KNOWN_FACTORS; if current factor needs just 1 there's no uninited // problem with the high limbs since KNOWN_FACTORS is zeroed at start of each new assignment: - ASSERT(HERE, nfac < 10, "Limit of 10 known factors!"); + ASSERT(nfac < 10, "Limit of 10 known factors!"); mi64_set_eq(KNOWN_FACTORS + 4*nfac++,fac,lenf); // Verify that F is a base-3 Fermat-PRP via binary modpow, 3^(q-1) == 1 (mod q): - ASSERT(HERE, mi64_pprimeF(fac,3ull,lenf),"Factor-is-base-3-PRP check fails!"); + ASSERT(mi64_pprimeF(fac,3ull,lenf),"Factor-is-base-3-PRP check fails!"); // Verify that it's a factor via binary modpow: p256.d0 = p; p256.d1 = p256.d2 = p256.d3 = 0ull; q256.d0 = KNOWN_FACTORS[0]; q256.d1 = KNOWN_FACTORS[1]; q256.d2 = KNOWN_FACTORS[2]; q256.d3 = KNOWN_FACTORS[3]; res256 = 
twopmmodq256(p256,q256); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { - ASSERT(HERE, CMPEQ256(res256,ONE256),"Factor-divides-modulus check fails!"); + ASSERT(CMPEQ256(res256,ONE256),"Factor-divides-modulus check fails!"); } else { res256.d0 += 1ull; // Fermat case: check that 2^p == -1 == q - 1 (mod q): - ASSERT(HERE, CMPEQ256(res256,q256),"Factor-divides-modulus check fails!"); + ASSERT(CMPEQ256(res256,q256),"Factor-divides-modulus check fails!"); } // If find any duplicate-entries in input list, warn & remove: if(nfac > 1) { @@ -6376,7 +6376,7 @@ uint32 extract_known_factors(uint64 p, char*fac_start) { if(mi64_cmp_eq(KNOWN_FACTORS + 4*i, KNOWN_FACTORS + 4*(nfac-1), 4)) { mi64_clear(KNOWN_FACTORS + 4*(--nfac), 4); // Using cbuf as both string-arg and target string is problematic, so use 2nd string-global cstr as target: - snprintf_nowarn(cstr,STR_MAX_LEN, "WARNING: p = %llu, known-factor list entry %s is a duplicate ... removing.\n",p,cbuf); + snprintf(cstr,STR_MAX_LEN, "WARNING: p = %" PRIu64 ", known-factor list entry %s is a duplicate ... 
removing.\n",p,cbuf); fprintf(stderr,"%s",cstr); } } @@ -6384,9 +6384,9 @@ uint32 extract_known_factors(uint64 p, char*fac_start) { cptr = char_addr+1; // Advance 1-char past the current , or " } if(char_addr != 0x0) { - sprintf(cbuf,"%s: Unrecognized token sequence in parsing known-factors portion of assignment: \"%s\".",WORKFILE,fac_start); ASSERT(HERE,0,cbuf); + sprintf(cbuf,"%s: Unrecognized token sequence in parsing known-factors portion of assignment: \"%s\".",WORKFILE,fac_start); ASSERT(0,cbuf); } - ASSERT(HERE, nfac != 0,"Must specify at least one known factor!"); + ASSERT(nfac != 0,"Must specify at least one known factor!"); // A bit of just-for-fun code: For smaller moduli N, use mi64 utils to see if cofactor C is a base-3 PRP: #if 0 const char mod_type[2] = {'-','+'}, *is_prp[] = {"is not","is"}, exclam[2] = {'.','!'}; @@ -6397,7 +6397,7 @@ uint32 extract_known_factors(uint64 p, char*fac_start) { qvec = ALLOC_UINT64(qvec,j); // Quotient stores cofactor C = N/F if(!mvec || !qvec) { sprintf(cbuf, "ERROR: unable to allocate arrays mvec,qvec in extract_known_factors.\n"); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } // Compute Modulus N ... 
note mi64-vecs have no cache-oriented element padding: @@ -6423,13 +6423,13 @@ uint32 extract_known_factors(uint64 p, char*fac_start) { mi64_mul_vector(BASE_MULTIPLIER_BITS,lenf, KNOWN_FACTORS+i,k, curr_fac,&lenf); mi64_set_eq(BASE_MULTIPLIER_BITS,curr_fac,lenf); } - ASSERT(HERE, lenf <= 20, "Product of factors too large to fit into curr_fac[]!"); + ASSERT(lenf <= 20, "Product of factors too large to fit into curr_fac[]!"); // Since F << N, use Mont-mul-div for C - quotient overwrites N, no rem-vec needed, just verify that F is in fact a divisor: - ASSERT(HERE, 1 == mi64_div(mvec,BASE_MULTIPLIER_BITS, j,lenf, qvec,0x0), "C = N/F should have 0 remainder!"); + ASSERT(1 == mi64_div(mvec,BASE_MULTIPLIER_BITS, j,lenf, qvec,0x0), "C = N/F should have 0 remainder!"); k = mi64_getlen(qvec,j); // j = number of nonzero limbs in cofactor C i = mi64_pprimeF(qvec,3,k); - printf("2^%llu %c 1 %s a base-3 Fermat-PRP%c\n",p,mod_type[MODULUS_TYPE == MODULUS_TYPE_FERMAT],is_prp[i],exclam[i]); + printf("2^%" PRIu64 " %c 1 %s a base-3 Fermat-PRP%c\n",p,mod_type[MODULUS_TYPE == MODULUS_TYPE_FERMAT],is_prp[i],exclam[i]); free((void *)mvec); mvec = 0x0; exit(0); #endif @@ -6451,7 +6451,7 @@ The decimal value of the GCD is returned in gcd_str, presumed to be dimensioned uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char*const gcd_str) { #if !INCLUDE_GMP #warning INCLUDE_GMP defined == 0 at compile time ... No GCDs will be done on p-1 outputs. - snprintf(cbuf,STR_MAX_LEN,"INCLUDE_GMP defined == 0 at compile time ... No GCD will be done.\n"); + snprintf(cbuf,STR_MAX_LEN*2,"INCLUDE_GMP defined == 0 at compile time ... 
No GCD will be done.\n"); mlucas_fprint(cbuf,1); return 0; // If user turns off p-1 support, keep the decl of gcd() to allow pm1.c to build #else @@ -6463,8 +6463,8 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char* uint32 i, retval = 0; double tdiff = 0.0, clock1, clock2; clock1 = getRealTime(); - ASSERT(HERE, vec1 != 0x0, "Null-pointer vec1 input to GCD()!"); - ASSERT(HERE,!(p && vec2), "One and only one of p and vec2 args to GCD() must be non-null!"); + ASSERT(vec1 != 0x0, "Null-pointer vec1 input to GCD()!"); + ASSERT(!(p && vec2), "One and only one of p and vec2 args to GCD() must be non-null!"); mpz_init(gmp_arr1); mpz_init(gmp_arr2); // Init divisor, remainder, quotient, in case of nontrivial raw GCD and >= 1 known factors: mpz_init(gmp_d); mpz_init(gmp_r); mpz_init(gmp_q); @@ -6472,7 +6472,7 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char* // Import vec1 into GMP array1, least-sign. element first, host byte order within each word, at 64-bit width: mpz_import(gmp_arr1, nlimb, -1, sizeof(uint64), 0, 0, vec1); if(p != 0) { - ASSERT(HERE, nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6, "Bad inputs to GCD()!"); + ASSERT(nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6, "Bad inputs to GCD()!"); mpz_mul_2exp(gmp_arr2, gmp_one,gmp_exp); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) // 2^p-1: mpz_sub(gmp_arr2, gmp_arr2,gmp_one); @@ -6481,15 +6481,15 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char* } else { mpz_import(gmp_arr2, nlimb, -1, sizeof(uint64), 0, 0, vec2); } - sz1 = mpz_sizeinbase(gmp_arr1,2);// gmp_printf("Input1 has %llu bits\n",sz1); - sz2 = mpz_sizeinbase(gmp_arr2,2);// gmp_printf("Input2 has %llu bits\n",sz2); + sz1 = mpz_sizeinbase(gmp_arr1,2);// gmp_printf("Input1 has %" PRIu64 " bits\n",sz1); + sz2 = mpz_sizeinbase(gmp_arr2,2);// gmp_printf("Input2 has %" PRIu64 " bits\n",sz2); // Take gcd and return in gmp_arr1: 
mpz_gcd(gmp_arr1, gmp_arr1,gmp_arr2); gmp_size = mpz_sizeinbase(gmp_arr1,2); if(gmp_size < 2) { goto gcd_return; // GCD = 0 or 1 } else { - if(KNOWN_FACTORS[0]) fprintf(stderr,"Raw GCD has %llu bits ... dividing out any known factors...\n",(uint64)gmp_size); + if(KNOWN_FACTORS[0]) fprintf(stderr,"Raw GCD has %" PRIu64 " bits ... dividing out any known factors...\n",(uint64)gmp_size); for(i = 0; i < 40; i += 4) { // Current limit = 10 factors, each stored in a 4-limb field, i.e. < 2^256 if(!KNOWN_FACTORS[i]) break; @@ -6507,24 +6507,24 @@ uint32 gcd(uint32 stage, uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb, char* gmp_size = mpz_sizeinbase(gmp_arr1,10); // Anything >= 900 digits (~90% the value of our STR_MAX_LEN dimensioning of I/O strings) treated as suspect: if(gmp_size >= 900) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "GCD has %u digits -- possible data corruption, aborting.\n",(uint32)gmp_size); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "GCD has %u digits -- possible data corruption, aborting.\n",(uint32)gmp_size); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } retval = 1; gcd_return: if(!p) { gmp_snprintf(gcd_str,STR_MAX_LEN,"%Zd",gmp_arr1); - gmp_snprintf(cbuf,STR_MAX_LEN,"GCD(A[%llu bits], B[%llu bits]) = %s\n",sz1,sz2,gcd_str); + gmp_snprintf(cbuf,STR_MAX_LEN*2,"GCD(A[%" PRIu64 " bits], B[%" PRIu64 " bits]) = %s\n",sz1,sz2,gcd_str); } else if(retval) { gmp_snprintf(gcd_str,STR_MAX_LEN,"%Zd",gmp_arr1); - gmp_snprintf(cbuf,STR_MAX_LEN,"Found %u-digit factor in Stage %u: %s\n",gmp_size,stage,gcd_str); + gmp_snprintf(cbuf,STR_MAX_LEN*2,"Found %u-digit factor in Stage %u: %s\n",gmp_size,stage,gcd_str); } else { // Caller can use either return value or empty gcd_str as proxy for "no factor found" gcd_str[0] = '\0'; - gmp_snprintf(cbuf,STR_MAX_LEN,"Stage %u: No factor found.\n",stage); + gmp_snprintf(cbuf,STR_MAX_LEN*2,"Stage %u: No factor found.\n",stage); } mlucas_fprint(cbuf,1); clock2 = getRealTime(); tdiff = clock2 - clock1; - 
snprintf(cbuf,STR_MAX_LEN,"Time for GCD =%s\n",get_time_str(tdiff)); + snprintf(cbuf,STR_MAX_LEN*2,"Time for GCD =%s\n",get_time_str(tdiff)); mlucas_fprint(cbuf,1); // Done with the GMP arrays: mpz_clear(gmp_arr1); mpz_clear(gmp_arr2); mpz_clear(gmp_d); mpz_clear(gmp_r); mpz_clear(gmp_q); @@ -6562,20 +6562,20 @@ void modinv(uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb) { uint64 *export_result_addr; double tdiff = 0.0, clock1, clock2; clock1 = getRealTime(); - ASSERT(HERE, vec1 != 0x0 && vec2 != 0x0, "Null-pointer input to MODINV()!"); + ASSERT(vec1 != 0x0 && vec2 != 0x0, "Null-pointer input to MODINV()!"); mpz_init(gmp_arr1); mpz_init(gmp_arr2); mpz_init_set_ui(gmp_one,1ull); gmp_exp = p; // Import vec1 into GMP array1, least-sign. element first, host byte order within each word, at 64-bit width: // void mpz_import (mpz_t rop, size_t count, int order, size_t size, int [Function] endian, size_t nails, const void *op) mpz_import(gmp_arr1, nlimb, -1, sizeof(uint64), 0, 0, vec1); - ASSERT(HERE, (p != 0) && (nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6), "Bad inputs to MODINV()!"); + ASSERT((p != 0) && (nlimb == (p + 63 + (MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6), "Bad inputs to MODINV()!"); mpz_mul_2exp(gmp_arr2, gmp_one,gmp_exp); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) // 2^p-1: mpz_sub(gmp_arr2, gmp_arr2,gmp_one); else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT)// F(m): p holds 2^m, so F(m) = 2^p+1: mpz_add(gmp_arr2, gmp_arr2,gmp_one); -// gmp_printf("Input1 has %llu bits\n",mpz_sizeinbase(gmp_arr1,2)); -// gmp_printf("Input2 has %llu bits\n",mpz_sizeinbase(gmp_arr2,2)); +// gmp_printf("Input1 has %" PRIu64 " bits\n",mpz_sizeinbase(gmp_arr1,2)); +// gmp_printf("Input2 has %" PRIu64 " bits\n",mpz_sizeinbase(gmp_arr2,2)); /* GMP mod-inverse; arglist as for mpz_gcd but also returns int: @@ -6590,13 +6590,13 @@ void modinv(uint64 p, uint64*vec1, uint64*vec2, uint32 nlimb) { retval = mpz_invert(gmp_arr1, gmp_arr1,gmp_arr2); gmp_size = 
mpz_sizeinbase(gmp_arr1,2); if(!retval) { - snprintf(cbuf,STR_MAX_LEN,"MODINV: Fatal error: inverse does not exist.\n"); - mlucas_fprint(cbuf,0); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"MODINV: Fatal error: inverse does not exist.\n"); + mlucas_fprint(cbuf,0); ASSERT(0,cbuf); } // Export the result from gmp_arr1 to destination array vec2: // void * mpz_export (void *rop, size_t *countp, int order, size_t size, int [Function] endian, size_t nails, const mpz_t op) export_result_addr = mpz_export(vec2, &inv_limbs, -1, sizeof(uint64), 0, 0, gmp_arr1); - ASSERT(HERE, inv_limbs <= nlimb && export_result_addr == vec2, "GMP was unable to export result to the specified target array!"); + ASSERT(inv_limbs <= nlimb && export_result_addr == vec2, "GMP was unable to export result to the specified target array!"); // Explicitly zero any excess limbs left at top of vec2: for(i = inv_limbs; i < nlimb; i++) { vec2[i] = 0ull; @@ -6637,7 +6637,7 @@ int restart_file_valid(const char*fname, const uint64 p, uint8*arr1, uint8*arr2) uint32 filegrep(const char*fname, const char*find_str, char*cstr, uint32 find_before_line_number) { uint32 curr_line = 0, found_line = 0; - ASSERT(HERE, cstr != 0x0, "filegrep(): cstr pointer argument must be non-null!"); + ASSERT(cstr != 0x0, "filegrep(): cstr pointer argument must be non-null!"); cstr[0] = '\0'; if(strlen(find_str) == 0) // Nothing to find return 0; @@ -6657,7 +6657,7 @@ uint32 filegrep(const char*fname, const char*find_str, char*cstr, uint32 find_be fclose(fptr); } else { sprintf(cbuf,"filegrep error: file %s not found.\n",fname); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } if(strlen(cstr) != 0) return found_line; @@ -6670,9 +6670,9 @@ void write_fft_debug_data(double a[], int jlo, int jhi) { int j,j1; const char dbg_fname[] = "FFT_DEBUG.txt"; - ASSERT(HERE, dbg_file == 0x0, "dbg_file != 0x0 prior to mlucas_fopen"); + ASSERT(dbg_file == 0x0, "dbg_file != 0x0 prior to mlucas_fopen"); dbg_file = mlucas_fopen(dbg_fname, "a"); - 
ASSERT(HERE, dbg_file != 0x0, "Unable to open dbg_file!"); + ASSERT(dbg_file != 0x0, "Unable to open dbg_file!"); fprintf(dbg_file, "RE_IM_STRIDE = %d\n", RE_IM_STRIDE); fprintf(dbg_file, "%s\n", cbuf); diff --git a/src/Mlucas.h b/src/Mlucas.h old mode 100755 new mode 100644 diff --git a/src/br.c b/src/br.c index 93d525cd..0e09182a 100755 --- a/src/br.c +++ b/src/br.c @@ -36,8 +36,8 @@ void print_pow2_twiddles(const uint32 n, const uint32 p, const uint32 q) const char csigns[2] = {'+','-'}; const char re_im[2] = {'c','s'}; char prefix[3]; // 0-slot for overall sign; 1 for complex operator * [Re / Im interchange], 2 for ~ [complex conjugation]. - ASSERT(HERE, n == (1<>52;\ shift = 1074 - dexp;\ -/*if(j1==0)printf("0xmant,shift,bits = %20llX %10d %10u\n",mant,shift,bits);*/\ +/*if(j1==0)printf("0xmant,shift,bits = %20" PRIX64 " %10d %10u\n",mant,shift,bits);*/\ if(shift<0)printf("WARN: j1 = %10d %20.15e gives negative shift count = %10d\n",j1,x,shift);\ if(shift < 52)\ {\ @@ -730,15 +730,15 @@ ifrac = mant << (63-shift);\ if(ifrac > ifracmax) ifracmax=ifrac;\ mant += ((uint64)1)<>(shift+1);\ -/*if(j1==0)printf("A: 0xmant = %20llX\n",mant);*/\ +/*if(j1==0)printf("A: 0xmant = %20" PRIX64 "\n",mant);*/\ mant -= (mant & sign)<<1;\ -/*if(j1==0)printf("B: 0xmant = %20llX\n",mant);*/\ +/*if(j1==0)printf("B: 0xmant = %20" PRIX64 "\n",mant);*/\ word = mant & (~(ones << bits));\ -/*if(j1==0)printf("C: 0xword = %20llX\n",word);*/\ +/*if(j1==0)printf("C: 0xword = %20" PRIX64 "\n",word);*/\ topbit= word >> (bits - 1);\ -/*if(j1==0)printf("D: 0xtbit = %20llX\n",topbit);*/\ +/*if(j1==0)printf("D: 0xtbit = %20" PRIX64 "\n",topbit);*/\ word -= topbit << bits;\ -/*if(j1==0)printf("E: 0xword = %20llX\n",word);*/\ +/*if(j1==0)printf("E: 0xword = %20" PRIX64 "\n",word);*/\ x = wt*(double)word;\ cy = (double)( (mant >> bits) + topbit );\ /*if(j1==0)printf("%20.4f %20.4f\n",x,cy);*/\ diff --git a/src/dft_macro.c b/src/dft_macro.c index b3432a7e..4b41aa29 100755 --- a/src/dft_macro.c +++ 
b/src/dft_macro.c @@ -3396,18 +3396,18 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 if(thr_id == -1) // Value of init stores #threads { if(init <= max_threads) { // Previously inited with sufficient #threads - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); return; } max_threads = init; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif if(sc_arr) { free((void *)sc_arr); } // 126 slots for DFT-63 data, 22 for DFT-7,9 consts and DFT-7 pads, 4 to allow for alignment = 152: - sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = tdat = sc_ptr; tmp = tdat + 126; @@ -3491,12 +3491,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 #endif return; } else { - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); } /* end of inits */ /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread 
ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); tdat = __r0 + thr_id*152; tmp = tdat + 126; two = tmp + 0x0; one = tmp + 0x1; @@ -3645,18 +3645,18 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 if(thr_id == -1) // Value of init stores #threads { if(init <= max_threads) { // Previously inited with sufficient #threads - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); return; } max_threads = init; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif if(sc_arr) { free((void *)sc_arr); } // 126 slots for DFT-63 data, 22 for DFT-7,9 consts and DFT-7 pads, 4 to allow for alignment = 152: - sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = tdat = sc_ptr; tmp = tdat + 126; @@ -3740,12 +3740,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 #endif return; } else { - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); } /* end of inits */ /* If multithreaded, set the 
local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); tdat = __r0 + thr_id*152; tmp = tdat + 126; two = tmp + 0x0; one = tmp + 0x1; @@ -3877,17 +3877,17 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 if(thr_id == -1) // Value of init stores #threads { if(init <= max_threads) { // Previously inited with sufficient #threads - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); return; } max_threads = init; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif if(sc_arr) { free((void *)sc_arr); } - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = tmp = sc_ptr; @@ -3925,7 +3925,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // VEC_DBL_INIT(sqrt2, SQRT2); VEC_DBL_INIT(tmp, ISRT2); VEC_DBL_INIT(nisrt2,-dtmp); VEC_DBL_INIT( isrt2, dtmp); // Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround: - VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == 
ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant + VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant VEC_DBL_INIT( cc1, c64_1); VEC_DBL_INIT( ss1, s64_1); tmp = cc1-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc2, c32_1); VEC_DBL_INIT( ss2, s32_1); tmp = cc2-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc3, c64_3); VEC_DBL_INIT( ss3, s64_3); tmp = cc3-1; VEC_DBL_INIT(tmp, dtmp); @@ -4013,7 +4013,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // VEC_DBL_INIT(sqrt2, SQRT2); VEC_DBL_INIT(tmp, ISRT2); VEC_DBL_INIT(nisrt2,-dtmp); VEC_DBL_INIT( isrt2, dtmp); // Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround: - VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant + VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant VEC_DBL_INIT( cc1, c64_1); VEC_DBL_INIT( ss1, s64_1); tmp = cc1-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc2, c32_1); VEC_DBL_INIT( ss2, s32_1); tmp = cc2-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc3, c64_3); VEC_DBL_INIT( ss3, s64_3); tmp = cc3-1; VEC_DBL_INIT(tmp, dtmp); @@ -4032,12 +4032,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // fprintf(stderr, "Init SSE2_RADIX_64_DIF with max_threads = %d\n",max_threads); return; } else { - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); } /* end of inits */ /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef 
MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); tmp = __r0 + thr_id*0x32; // To support FMA versions of the radix-8 macros used to build radix-64 we insert a standalone copy of the [2,1,sqrt2,isrt2] quartet: two = tmp + 0; // AVX+ versions of various DFT macros assume consts 2.0,1.0,isrt2 laid out thusly @@ -4280,17 +4280,17 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 if(thr_id == -1) // Value of init stores #threads { if(init <= max_threads) { // Previously inited with sufficient #threads - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); return; } max_threads = init; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif if(sc_arr) { free((void *)sc_arr); } - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = tmp = sc_ptr; @@ -4328,7 +4328,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // VEC_DBL_INIT(sqrt2, SQRT2); VEC_DBL_INIT(tmp, ISRT2); VEC_DBL_INIT(nisrt2,-dtmp); VEC_DBL_INIT( isrt2, dtmp); // Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit 
workaround: - VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant + VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant VEC_DBL_INIT( cc1, c64_1); VEC_DBL_INIT( ss1, s64_1); tmp = cc1-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc2, c32_1); VEC_DBL_INIT( ss2, s32_1); tmp = cc2-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc3, c64_3); VEC_DBL_INIT( ss3, s64_3); tmp = cc3-1; VEC_DBL_INIT(tmp, dtmp); @@ -4416,7 +4416,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // VEC_DBL_INIT(sqrt2, SQRT2); VEC_DBL_INIT(tmp, ISRT2); VEC_DBL_INIT(nisrt2,-dtmp); VEC_DBL_INIT( isrt2, dtmp); // Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround: - VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant + VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant VEC_DBL_INIT( cc1, c64_1); VEC_DBL_INIT( ss1, s64_1); tmp = cc1-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc2, c32_1); VEC_DBL_INIT( ss2, s32_1); tmp = cc2-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc3, c64_3); VEC_DBL_INIT( ss3, s64_3); tmp = cc3-1; VEC_DBL_INIT(tmp, dtmp); @@ -4435,12 +4435,12 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // fprintf(stderr, "Init SSE2_RADIX_64_DIT with max_threads = %d\n",max_threads); return; } else { - ASSERT(HERE, sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before use!"); + ASSERT(sc_arr != 0, "This function requires an initial Init-consts-mode call (in 1-thread mode only) before 
use!"); } /* end of inits */ /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); tmp = __r0 + thr_id*0x32; // To support FMA versions of the radix-8 macros used to build radix-64 we insert a standalone copy of the [2,1,sqrt2,isrt2] quartet: two = tmp + 0; // AVX+ versions of various DFT macros assume consts 2.0,1.0,isrt2 laid out thusly @@ -4888,7 +4888,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1 // Index-offset names here reflect original unpermuted inputs, but the math also works for permuted ones: int i,j,nshift, *off_ptr; int p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pa,pb,pc,pd,pe,pf; - ASSERT(HERE, o_idx != 0x0, "Null o_idx pointer in SSE2_RADIX256_DIF!"); + ASSERT(o_idx != 0x0, "Null o_idx pointer in SSE2_RADIX256_DIF!"); // NOTE that unlike the RADIX_08_DIF_OOP() macro used for pass 1 of the radix-64 DFT, RADIX_16_DIF outputs are IN-ORDER rather than BR: #ifdef USE_ARM_V8_SIMD uint32 OFF1,OFF2,OFF3,OFF4; diff --git a/src/f2psp.h b/src/f2psp.h index 11ee2170..c6b7ba96 100755 --- a/src/f2psp.h +++ b/src/f2psp.h @@ -37,8 +37,8 @@ extern "C" { retval \ )\ {\ - DBG_ASSERT(HERE, qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\ - DBG_ASSERT(HERE, ((uint32)&a[0] & 0x3f) == 0, "A-array not 64-byte aligned!");\ + DBG_ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!");\ + DBG_ASSERT(((uint32)&a[0] & 0x3f) == 0, "A-array not 64-byte aligned!");\ __asm mov eax, array_64x8inputs /* Assumes inputs a,b,c,d,... 
are 64-bit separated and &a[0} is 64-byte aligned */\ __asm lea ebx, q\ __asm lea ecx, qinv\ diff --git a/src/factor.c b/src/factor.c index e23b1bbd..bed29234 100755 --- a/src/factor.c +++ b/src/factor.c @@ -196,7 +196,7 @@ int restart; uint64 PMIN; /* minimum #bits allowed for FFT-based mul */ uint64 PMAX; /* maximum #bits allowed depends on max. FFT length allowed and will be determined at runtime, via call to given_N_get_maxP(). */ - char cbuf[STR_MAX_LEN],cstr[STR_MAX_LEN]; + char cbuf[STR_MAX_LEN*2],cstr[STR_MAX_LEN]; char in_line[STR_MAX_LEN]; /* Declare a blank STATFILE string to ease program logic: */ char STATFILE[] = ""; @@ -547,7 +547,7 @@ Unlike for (mod 60), use simple utility functions to manage these, rather than a int factor(char *pstring, double bmin, double bmax) { - ASSERT(HERE, 0, "TF currently not supported as part of Mlucas, only via standalone Mfactor build - please delete any .o files and retry USING 'makemake.sh mfac' from Mluas dir above /src."); + ASSERT(0, "TF currently not supported as part of Mlucas, only via standalone Mfactor build - please delete any .o files and retry USING 'makemake.sh mfac' from Mluas dir above /src."); return 1; } @@ -732,7 +732,7 @@ int main(int argc, char *argv[]) if(cudaError != cudaSuccess) { printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError)); - ASSERT(HERE, 0, "factor.c : GPU-side error detected!"); + ASSERT(0, "factor.c : GPU-side error detected!"); } #endif @@ -743,7 +743,7 @@ int main(int argc, char *argv[]) /* Allocate factor_k array and align on 16-byte boundary: */ factor_ptmp = ALLOC_UINT64(factor_ptmp, 24); factor_k = ALIGN_UINT64(factor_ptmp); factor_ptmp = 0x0; - ASSERT(HERE, ((uint64)factor_k & 0x3f) == 0, "factor_k not 64-byte aligned!"); + ASSERT(((uint64)factor_k & 0x3f) == 0, "factor_k not 64-byte aligned!"); /*...initialize logicals and factoring parameters... 
*/ restart = FALSE; @@ -924,14 +924,14 @@ Others are optional and in some cases mutually exclusive: { strncpy(stFlag, argv[nargs++], STR_MAX_LEN); passmin = (uint32)convert_base10_char_uint64(stFlag); - ASSERT(HERE, passmin < TF_PASSES,"factor.c: passmin < TF_PASSES"); + ASSERT(passmin < TF_PASSES,"factor.c: passmin < TF_PASSES"); } else if(STREQ(stFlag, "-passmax")) { strncpy(stFlag, argv[nargs++], STR_MAX_LEN); passmax = (uint32)convert_base10_char_uint64(stFlag); - ASSERT(HERE, passmax < TF_PASSES,"factor.c: passmax < TF_PASSES"); - ASSERT(HERE, passmax >= passmin ,"factor.c: passmax >= passmin"); + ASSERT(passmax < TF_PASSES,"factor.c: passmax < TF_PASSES"); + ASSERT(passmax >= passmin ,"factor.c: passmax >= passmin"); } // Number of threads to use? @@ -953,7 +953,7 @@ Others are optional and in some cases mutually exclusive: NTHREADS = itmp; } #ifdef NWORD - ASSERT(HERE, NTHREADS == 1, "Arbitrary-precision build currently only supports single-threaded runs!"); + ASSERT(NTHREADS == 1, "Arbitrary-precision build currently only supports single-threaded runs!"); #endif #endif } @@ -968,11 +968,11 @@ Others are optional and in some cases mutually exclusive: #else /* If non-standalone mode, make sure statfile name is non-empty: */ - ASSERT(HERE, STRNEQ(STATFILE, ""), "STATFILE string empty"); + ASSERT(STRNEQ(STATFILE, ""), "STATFILE string empty"); fp = mlucas_fopen(STATFILE, "a"); if(!fp) { fprintf(stderr,"ERROR: Unable to open statfile %s for writing.\n",STATFILE); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } else { fclose(fp); fp = 0x0; } @@ -985,12 +985,12 @@ Others are optional and in some cases mutually exclusive: first_entry = FALSE; #ifndef MULTITHREAD #warning Building factor.c in unthreaded (i.e. single-main-thread) mode. 
- ASSERT(HERE, NTHREADS == 1, "NTHREADS must == 1 in single-threaded mode!"); + ASSERT(NTHREADS == 1, "NTHREADS must == 1 in single-threaded mode!"); k_to_try = (uint64 *)calloc(TRYQ * NTHREADS, sizeof(uint64)); #else MAX_THREADS = get_num_cores(); - ASSERT(HERE, MAX_THREADS > 0, "Illegal #Cores value stored in MAX_THREADS"); - ASSERT(HERE, MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h ."); + ASSERT(MAX_THREADS > 0, "Illegal #Cores value stored in MAX_THREADS"); + ASSERT(MAX_THREADS <= MAX_CORES,"MAX_THREADS exceeds the MAX_CORES setting in Mdata.h ."); if(!NTHREADS) { NTHREADS = 1; @@ -998,7 +998,7 @@ Others are optional and in some cases mutually exclusive: // Use the same affinity-setting code here as for the -cpu option, but simply for cores [0:NTHREADS-1]: } else if(NTHREADS > MAX_CORES) { sprintf(cbuf,"FATAL: NTHREADS = %d exceeds the MAX_CORES setting in Mdata.h = %d\n", NTHREADS, MAX_CORES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } else { // In timing-test mode, allow #threads > #cores if(NTHREADS > MAX_THREADS) { fprintf(stderr,"WARN: NTHREADS = %d exceeds number of cores = %d\n", NTHREADS, MAX_THREADS); @@ -1021,7 +1021,7 @@ Others are optional and in some cases mutually exclusive: // do TF_PASSES 'work units' (factoring passes for various (k mod TF_CLASSES) k-classes: main_work_units = 0; pool_work_units = NTHREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("Factor.c: Init threadpool of %d threads\n", NTHREADS); // Apr 2015: Init-calls to any inline-asm-using modpow functions: @@ -1050,7 +1050,7 @@ Others are optional and in some cases mutually exclusive: // Oct 2015: GCD-associated self-tests provides a fair bit of added coverage of the mi64 library, so always include: #ifdef INCLUDE_PM1 /* 
Simple self-tester for GCD routines in gcd_lehmer.c: */ - ASSERT(HERE, test_gcd() == 0, "Factor_init : GCD test failed.\n"); + ASSERT(test_gcd() == 0, "Factor_init : GCD test failed.\n"); exit(0); #endif @@ -1058,44 +1058,44 @@ exit(0); command-line parameter, will attempt to read the other needed run parameters from the corresponding checkpoint file: */ - ASSERT(HERE, STRNEQ(pstring,""),"factor.c : pstring empty!"); + ASSERT(STRNEQ(pstring,""),"factor.c : pstring empty!"); /* -bmin/bmax used to set bounds for factoring: */ if(bmin || bmax) { - ASSERT(HERE, (kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0)"); + ASSERT((kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0)"); if(bmin < 0) { - fprintf(stderr,"ERROR: log2(min factor) must be >= 0. Offending entry = %lf.\n", bmin); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: log2(min factor) must be >= 0. Offending entry = %lf.\n", bmin); ASSERT(0,"0"); } else if(bmin >= MAX_BITS_Q) { - fprintf(stderr,"ERROR: log2(min factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmin); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: log2(min factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmin); ASSERT(0,"0"); } if(bmax <= 0) { - fprintf(stderr,"ERROR: log2(max factor) must be > 0. Offending entry = %lf.\n", bmax); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: log2(max factor) must be > 0. Offending entry = %lf.\n", bmax); ASSERT(0,"0"); } else if(bmax > MAX_BITS_Q) { - fprintf(stderr,"ERROR: log2(max factor) exceeds allowable limit of %u. Offending entry = %lf.\n", MAX_BITS_Q, bmax); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: log2(max factor) exceeds allowable limit of %u. 
Offending entry = %lf.\n", MAX_BITS_Q, bmax); ASSERT(0,"0"); } if(bmax < bmin) { - fprintf(stderr,"ERROR: (bmax = %lf) < (bmin = %lf)!\n", bmax, bmin); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: (bmax = %lf) < (bmin = %lf)!\n", bmax, bmin); ASSERT(0,"0"); } } /* -kmin/kmax used to set bounds for factoring: */ if(kmin || kmax) { - ASSERT(HERE, kmax != 0 ,"factor.c: kmax not set!"); - ASSERT(HERE, (int64)kmax > 0, "kmax must be 63 bits or less!"); - ASSERT(HERE, (bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)"); + ASSERT(kmax != 0 ,"factor.c: kmax not set!"); + ASSERT((int64)kmax > 0, "kmax must be 63 bits or less!"); + ASSERT((bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)"); if(kmax < kmin) { fprintf(stderr,"ERROR: (kmax = %s) < (kmin = %s)!\n", &char_buf0[convert_uint64_base10_char(char_buf0, kmax)], &char_buf1[convert_uint64_base10_char(char_buf1, kmin)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } - ASSERT(HERE, bmax > 0.0 || kmax != 0 ,"factor.c: One of bmax or kmax must be set!"); + ASSERT(bmax > 0.0 || kmax != 0 ,"factor.c: One of bmax or kmax must be set!"); - ASSERT(HERE, (MODULUS_TYPE == MODULUS_TYPE_MERSENNE) + ASSERT((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) || (MODULUS_TYPE == MODULUS_TYPE_MERSMERS) || (MODULUS_TYPE == MODULUS_TYPE_FERMAT) , "Unsupported modulus type!"); @@ -1119,7 +1119,7 @@ exit(0); nbits_in_p = findex; if(findex > 1000) { // Large MMp need deeper sieving on each k passing the default sieve kdeep = (uint32 *)calloc( 1024, sizeof(uint32)); - ASSERT(HERE, kdeep != 0x0, "Calloc of kdeep[] failed!"); + ASSERT(kdeep != 0x0, "Calloc of kdeep[] failed!"); } lenP = (nbits_in_p + 63)>>6; p = (uint64 *)calloc( ((uint32)MAX_BITS_P + 63)>>6, sizeof(uint64)); @@ -1131,7 +1131,7 @@ exit(0); } else { // Convert stringified exponent to mi64 form, using same #limbs as for factor candidates: p = convert_base10_char_mi64(pstring, &lenQ); // This does the mem-alloc for us in this case - lenP = mi64_getlen(p, lenQ); 
ASSERT(HERE, lenP > 0, "factor.c: Error converting pstring!"); + lenP = mi64_getlen(p, lenQ); ASSERT(lenP > 0, "factor.c: Error converting pstring!"); nbits_in_p = (lenP<<6) - mi64_leadz(p, lenP); } @@ -1157,14 +1157,14 @@ exit(0); // Mersenne numbers must have odd (check primality further on) exponents: if((MODULUS_TYPE != MODULUS_TYPE_FERMAT) && (p[0] & 1) == 0) { - fprintf(stderr,"p must be odd! Offending p = %s\n", pstring); ASSERT(HERE, 0,"0"); + fprintf(stderr,"p must be odd! Offending p = %s\n", pstring); ASSERT(0,"0"); } /* For purposes of the bits-in-p limit, treat Fermat numbers as having 2^findex rather than 2^findex + 1 bits: */ if((nbits_in_p - (MODULUS_TYPE == MODULUS_TYPE_FERMAT)) > MAX_BITS_P) { fprintf(stderr,"p too large - limit is %u bits. Offending p = %s\n", MAX_BITS_P, pstring); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } // To track lg(q) = lg(2.k.p+1), use approximation q ~= 2.k.p, thus lg(q) ~= lg(2.p) + lg(k). fbits_in_2p = (double)mi64_extract_lead64(p, lenP, &itmp64) - 64; @@ -1173,16 +1173,16 @@ exit(0); //printf("fbits_in_2p += log((double)itmp64)*ILG2 [= %10.4f] = %10.4f\n",log((double)itmp64)*ILG2,fbits_in_2p); #if 0 // 11/2013: No clue what I was thinking here... // If 2p < 2^64 we left-justify the leading bits to make result lie in [2^63, 2^64), so result here must always be > 2^63: - ASSERT(HERE, fbits_in_2p >= 63, "fbits_in_2p out of range!"); + ASSERT(fbits_in_2p >= 63, "fbits_in_2p out of range!"); fbits_in_2p += nbits_in_p - 64.0; // lg(2.p) ... Cast 64 to double to avoid signed-int subtract of RHS terms. 
#endif // Do some quick sanity tests of exponent for the various kinds of moduli: if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, findex == mi64_trailz(p, lenP), "Internal Fermat-exponent bad power of 2!"); + ASSERT(findex == mi64_trailz(p, lenP), "Internal Fermat-exponent bad power of 2!"); mi64_shrl(p, q, findex, lenP,lenP); mi64_sub_scalar(q, 1ull, q, lenP); - ASSERT(HERE, mi64_iszero(q, lenP), "Internal Fermat-exponent not a power of 2!"); + ASSERT(mi64_iszero(q, lenP), "Internal Fermat-exponent not a power of 2!"); } else { @@ -1193,14 +1193,14 @@ exit(0); { if(findex == knowns[i]) { break; } } - ASSERT(HERE, (knowns[i] != 0), "Double-Mersenne exponent not a known Mersenne prime!"); + ASSERT((knowns[i] != 0), "Double-Mersenne exponent not a known Mersenne prime!"); // And now proceed to all-binary-ones test of vector-form M(p): mi64_add_scalar(p, 1ull, q, lenP); - ASSERT(HERE, findex == mi64_trailz(q, lenP), "Internal M(M(p))-exponent bad power of 2!"); + ASSERT(findex == mi64_trailz(q, lenP), "Internal M(M(p))-exponent bad power of 2!"); mi64_shrl(q, q, findex, lenP,lenP); mi64_sub_scalar(q, 1ull, q, lenP); - ASSERT(HERE, mi64_iszero(q, lenP), "Internal M(M(p))-exponent fails all-binary-ones check!"); + ASSERT(mi64_iszero(q, lenP), "Internal M(M(p))-exponent fails all-binary-ones check!"); } // We can use a lookup table vs known M(p) for all cases, but if Mersenne or M(M(p)) with suitably small p, // add a base-2 Fermat PRP test, more as a self-test of the various modpow routines than anything else: @@ -1209,7 +1209,7 @@ exit(0); if(!mi64_twopmodq(q, lenP, 0, p, lenP, 0x0)) { fprintf(stderr,"WARNING: p = %s is not prime ... proceeding anyway, on presumption user wants this.\n", pstring); - // ASSERT(HERE, 0,"0"); Dec 2019 ... allowing odd composite exponents can still be useful, e.g. ATH used to TF M(p^2) for known Mersenne primes + // ASSERT(0,"0"); Dec 2019 ... allowing odd composite exponents can still be useful, e.g. 
ATH used to TF M(p^2) for known Mersenne primes } } } @@ -1278,14 +1278,14 @@ exit(0); if(kmax) { interval_hi = (uint64)ceil((double)kmax/((uint64)len << TF_CLSHIFT)); // Copied from restart-file code below u64_arr[lenP] = mi64_mul_scalar( p, 2*interval_hi*(len << TF_CLSHIFT), u64_arr, lenP); - ASSERT(HERE, lenQ == lenP+(u64_arr[lenP] != 0), ""); + ASSERT(lenQ == lenP+(u64_arr[lenP] != 0), ""); nbits_in_q = (lenQ<<6) - mi64_leadz(u64_arr, lenQ); if(nbits_in_q > MAX_BITS_Q) { fprintf(stderr,"qmax too large - limit is %u bits. Offending p, kmax = %s, %s\n", MAX_BITS_Q, pstring, &char_buf0[convert_uint64_base10_char(char_buf0, kmax)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -1311,7 +1311,7 @@ exit(0); else fprintf(stderr,"INFO: Will write checkpoint data to savefile %s.\n",RESTARTFILE); - fprintf(stderr,"INFO: Will write savefile %s every 2^%u = %llu factor candidates tried.\n",RESTARTFILE,CMASKBITS,countmask+1); + fprintf(stderr,"INFO: Will write savefile %s every 2^%u = %" PRIu64 " factor candidates tried.\n",RESTARTFILE,CMASKBITS,countmask+1); /**** process restart-file and any command-line params: ****/ // Note: return value of read_savefile is signed: @@ -1323,9 +1323,9 @@ exit(0); fq = mlucas_fopen(STATFILE,"a"); fprintf(fq,"%s",cbuf); fclose(fq); fq = 0x0; #endif // Init savefile with above read_savefile fields so ensuing checkpoint-writes only need to update the pass# and k: -// ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!"); +// ASSERT(0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!"); } else { - ASSERT(HERE,!itmp,"There were errors reading the savefile ... aborting"); + ASSERT(!itmp,"There were errors reading the savefile ... 
aborting"); count = 0ull; // Need to reset == 0 prior to sieving so kvector-fill code works properly /* If previous run is not yet complete, ignore any increased factor-bound-related @@ -1349,19 +1349,19 @@ exit(0); ****/ if(bmin || bmax) { #if(!defined(P1WORD)) - // ASSERT(HERE, 0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!"); + // ASSERT(0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!"); #endif - ASSERT(HERE, (kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run."); + ASSERT((kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run."); if(bmin) { - ASSERT(HERE, bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file"); + ASSERT(bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file"); if(bmin < bmax_file) fprintf(stderr,"WARNING: Specified bmin (%lf) smaller than previous-run bmax = %lf. Setting equal to avoid overlapping runs.\n", bmin, bmax_file); } bmin = bmax_file; /* We expect any command-line bmax will be > that in the restart file: */ if(bmax) - ASSERT(HERE, bmax > bmax_file - 0.0000000001,"bmax >= bmax_file"); + ASSERT(bmax > bmax_file - 0.0000000001,"bmax >= bmax_file"); } /**** @@ -1371,26 +1371,26 @@ exit(0); if not we warn and set kmin = kmax_file), and that kmax > kmax_file. ****/ if(kmin || kmax) { - ASSERT(HERE, (bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)"); + ASSERT((bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)"); if(kmin) { - ASSERT(HERE, kmin >= kmin_file,"kmin >= kmin_file"); + ASSERT(kmin >= kmin_file,"kmin >= kmin_file"); if(kmin < kmax_file) fprintf(stderr,"WARNING: Specified kmin (%s) smaller than previous-run kmax = %s. 
Setting equal to avoid overlapping runs.\n", &char_buf0[convert_uint64_base10_char(char_buf0, kmax)], &char_buf1[convert_uint64_base10_char(char_buf1, kmax_file)]); } kmin = kmax_file; /* We expect any command-line kmax will be > that in the restart file: */ if(kmax) - ASSERT(HERE, kmax > kmax_file,"kmax >= kmax_file"); + ASSERT(kmax > kmax_file,"kmax >= kmax_file"); } /**** 3) -kplus used to increment an upper bound from a previous factoring run: ****/ if(kplus) { - ASSERT(HERE, (bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)"); + ASSERT((bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)"); kmin = kmax_file; /* Ensure incremented value kmax fits into a 64-bit unsigned int: */ - ASSERT(HERE, (kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!"); + ASSERT((kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!"); kmax = kmin + kplus; kplus = 0; /* If kplus != 0 detected further on, that indicates that no valid restart file was found for factoring-bounds incrementing. */ @@ -1410,18 +1410,18 @@ exit(0); if(passmin > (TF_PASSES-1) ) { fprintf(stderr,"ERROR: passmin must be <= %u. Offending entry = %u.\n", TF_PASSES-1, passmin); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } if(passmax < passmin) { fprintf(stderr,"ERROR: (passmax = %u) < (passmin = %u)!\n", passmax, passmin); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } if(passmax > (TF_PASSES-1) ) { fprintf(stderr,"ERROR: passmax must be <= %u. Offending entry = %u.\n", TF_PASSES-1, passmax); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /**** Process factor candidate bounds: ****/ @@ -1433,16 +1433,16 @@ exit(0); #endif /* Compute kmax if not already set: */ if(!kmax) { - ASSERT(HERE, bmax <= (nbits_in_p+65), "Specified bmax implies kmax > 64-bit, which exceeds the program's limit ... aborting."); + ASSERT(bmax <= (nbits_in_p+65), "Specified bmax implies kmax > 64-bit, which exceeds the program's limit ... 
aborting."); kmax = given_b_get_k(bmax, two_p, lenQ); - ASSERT(HERE, kmax > 0, "Something went wrong with the computation of kmax ... possibly your bmax implies kmax > 64-bit?"); + ASSERT(kmax > 0, "Something went wrong with the computation of kmax ... possibly your bmax implies kmax > 64-bit?"); } if(kmin || bmin) { if(kmin == 0ull) { /* Lower Bound given in log2rithmic form */ - ASSERT(HERE, bmin <= bmax, "bmin >= bmax!"); + ASSERT(bmin <= bmax, "bmin >= bmax!"); kmin = given_b_get_k(bmin, two_p, lenQ); } else { - ASSERT(HERE, kmin <= kmax, "kmin >= kmax!"); + ASSERT(kmin <= kmax, "kmin >= kmax!"); #ifdef P1WORD fqlo = kmin*twop_float + 1.0; bmin = log(fqlo)*ILG2; @@ -1453,7 +1453,7 @@ exit(0); fqlo = 1.0; #endif } -ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!"); +ASSERT(0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, passmin,passnow,passmax, count),"init_savefile failed!"); //**** Do savefile-init here? ****** if(kmax || bmax) { if(kmax == 0ull) { /* Upper Bound given in log2rithmic form */ @@ -1465,14 +1465,14 @@ ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, #endif } } else - ASSERT(HERE, 0 ,"factor.c : One of bmax, kmax must be nonzero!"); + ASSERT(0 ,"factor.c : One of bmax, kmax must be nonzero!"); /**** At this point the paired elements bmin|kmin, bmax|kmax are in synchrony. ****/ /* If kplus given on command line, a valid restart file should have been found and kmax incremented at this point, i.e. 
kplus should have been reset to zero: */ - ASSERT(HERE, kplus == 0, "kplus must be zero here!"); + ASSERT(kplus == 0, "kplus must be zero here!"); know = kmin; passnow = passmin; @@ -1483,7 +1483,7 @@ ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, /****************** SIEVE STUFF: *********************/ /*****************************************************/ - ASSERT(HERE, NUM_SIEVING_PRIME > 0, "factor.c : NUM_SIEVING_PRIME > 0"); + ASSERT(NUM_SIEVING_PRIME > 0, "factor.c : NUM_SIEVING_PRIME > 0"); /* allocate the arrays and initialize the array of sieving primes */ temp_late = (uint64 *)calloc(len, sizeof(uint64)); @@ -1498,12 +1498,12 @@ ASSERT(HERE,0 == init_savefile(RESTARTFILE, pstring, bmin,bmax, kmin,know,kmax, bit_map2= (uint64 *)calloc(i * NTHREADS, sizeof(uint64)); // 2nd alloc to give each thread 1 bit-clearable copy of master bit_map if (bit_map == NULL) { fprintf(stderr,"Memory allocation failure for BITMAP array"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } bit_atlas = (uint64 *)calloc(i * TF_PASSES, sizeof(uint64)); if (bit_atlas == NULL) { fprintf(stderr,"Memory allocation failure for TEMPLATE array"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that in bit_atlas]\n",len,i,TF_PASSES); @@ -1511,51 +1511,51 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that psmall = (uint32 *)calloc(NUM_SIEVING_PRIME * NTHREADS, sizeof(uint32)); if (psmall == NULL) { fprintf(stderr,"Memory allocation failure for PSMALL array"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #endif pdiff = (uint8 *)calloc(NUM_SIEVING_PRIME * NTHREADS, sizeof(uint8)); if (pdiff == NULL) { fprintf(stderr,"Memory allocation failure for pdiff array"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } startval = (uint32 *)calloc(NUM_SIEVING_PRIME * NTHREADS, sizeof(uint32)); if (startval == NULL) { fprintf(stderr,"Memory allocation failure for STARTVAL array"); - 
ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } pinv = (uint32 *)calloc(NUM_SIEVING_PRIME, sizeof(uint32)); if (pinv == NULL) { fprintf(stderr,"Memory allocation failure for PINV array"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #if DBG_SIEVE startval_incr = (uint32 *)calloc(NUM_SIEVING_PRIME, sizeof(uint32)); if (startval_incr == NULL) { fprintf(stderr,"Memory allocation failure for STARTVAL_INCR array"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #endif /* Check integrity (at least in the sense of monotonicity) for the precomputed pseudoprime table: */ for(i = 1; i < 9366; ++i) { - ASSERT(HERE, f2psp[i] > f2psp[i-1],"Misplaced pseudoprime!"); + ASSERT(f2psp[i] > f2psp[i-1],"Misplaced pseudoprime!"); } /* Test some near-2^32 known-prime cases: */ curr_p = (uint32)-5; itmp32 = twopmodq32(curr_p-1, curr_p); - ASSERT(HERE, itmp32 == 1,"twopmodq32: 2^32 - 5 test fails!"); + ASSERT(itmp32 == 1,"twopmodq32: 2^32 - 5 test fails!"); curr_p = (uint32)-17; itmp32 = twopmodq32(curr_p-1, curr_p); - ASSERT(HERE, itmp32 == 1,"twopmodq32: 2^32 -17 test fails!"); + ASSERT(itmp32 == 1,"twopmodq32: 2^32 -17 test fails!"); curr_p = (uint32)-35; /* Start of the last length-30 curr_p%30 == 11 interval < 2^32; the 6th candidate in that interval, 2^32-17, is prime */ itmp32 = twopmodq32_x8(curr_p, curr_p+ 2, curr_p+ 6, curr_p+ 8, curr_p+12, curr_p+18, curr_p+20, curr_p+26); - ASSERT(HERE, itmp32 ==32,"twopmodq32_x8: 2^32 -35 test fails!"); + ASSERT(itmp32 ==32,"twopmodq32_x8: 2^32 -35 test fails!"); fprintf(stderr,"Generating difference table of first %u small primes\n", nprime); curr_p = 3; /* Current prime stored in l. 
*/ @@ -1596,7 +1596,7 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that { if((itmp32 >> j)&0x1) // It's a PRP, so check against the table of known pseudoprimes and { // (if it's not a PSP) init for the next gap - ASSERT(HERE, curr_p <= f2psp[f2psp_idx],"Error in pseudoprime sieve"); + ASSERT(curr_p <= f2psp[f2psp_idx],"Error in pseudoprime sieve"); if((curr_p + pdsum_8[j]) == f2psp[f2psp_idx]) /* It's a base-2 pseudoprime */ { ++f2psp_idx; @@ -1645,13 +1645,13 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that #if 0 // Oct 2015: Play with Smarandache numbers (): - i = 2000000; ASSERT(HERE, i <= nprime, "prime limit exceeded in testSmarandache!"); + i = 2000000; ASSERT(i <= nprime, "prime limit exceeded in testSmarandache!"); testSmarandache(100001,101000, pdiff, i); exit(0); #endif #if 0 // Oct 2018: Play with "sieve survivors" stats: lim(n --> oo) prod_(p <= n)(1-1/p)/(1/ln(p^2)) - i = 1000000000; ASSERT(HERE, i <= MAX_SIEVING_PRIME, "prime limit exceeded in testSieveProdAsymp!"); + i = 1000000000; ASSERT(i <= MAX_SIEVING_PRIME, "prime limit exceeded in testSieveProdAsymp!"); struct qfloat qfprod = QHALF, qt; double prod = 0.5, log_psq = log((double)i*i); for(m = 0, curr_p = 3; m < nprime; m++) { @@ -1766,7 +1766,7 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that if(p > curr_p) break; curr_p -= (pdiff[nprime--] << 1); #ifdef FAC_DEBUG - ASSERT(HERE, curr_p == prime[nprime], "factor.c : curr_p == prime[nprime]"); + ASSERT(curr_p == prime[nprime], "factor.c : curr_p == prime[nprime]"); #endif } MAX_SIEVING_PRIME = curr_p; @@ -1792,20 +1792,20 @@ printf("Allocated %u words in master template, %u in per-pass bit_map [%u x that #else // 4620 classes: pass_targ = CHECK_PKMOD4620(p,lenP, k_targ, 0x0) - 1; #endif - ASSERT(HERE, (pass_targ < TF_PASSES), "Candidate factor set via k_targ is not a possible factor for this exponent!"); + ASSERT((pass_targ < TF_PASSES), 
"Candidate factor set via k_targ is not a possible factor for this exponent!"); printf("Target pass for debug-factor = %u\n",pass_targ); } #endif itmp64 = (uint64)mi64_div_y32(p,TF_CLASSES,0x0,lenP); -// printf("p %% 60 = %llu\n",itmp64); +// printf("p %% 60 = %" PRIu64 "\n",itmp64); #if TF_CLASSES == 60 /* const int pmod_vec[] = { 1, 7,11,13,17,19,23,29,31,37,41,43,47,49,53,59, 2,4,8,16,32, 0x0}; for(i = 0; pmod_vec[i] != 0; i++) { - ASSERT(HERE, CHECK_PKMOD60(pmod_vec[i], k, incr) == 16, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES = 16!\n"); + ASSERT(CHECK_PKMOD60(pmod_vec[i], k, incr) == 16, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES = 16!\n"); } exit(0); Mersenne Mp: Acceptable km-values for the 16 possible pm (= p%60) values: @@ -1833,7 +1833,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: pm = 32: 0, 4,10,12,18,22,24,28,30,34,40,42,48,52,54,58 */ i = CHECK_PKMOD60 (&itmp64,1, k, incr); - ASSERT(HERE, i == TF_PASSES, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[1,7,11,13,17,19,23,29,31,37,41,43,47,49,53,59] (mod 60).\n"); + ASSERT(i == TF_PASSES, "CHECK_PKMOD60 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[1,7,11,13,17,19,23,29,31,37,41,43,47,49,53,59] (mod 60).\n"); /* printf("k mod 60 = ["); for(i = 0, j = 0; i < 16; i++) { @@ -1845,7 +1845,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: */ #else // 4620 classes: i = CHECK_PKMOD4620(&itmp64,1, k, incr); - ASSERT(HERE, i == TF_PASSES, "CHECK_PKMOD4620 returns something other than the expected #TF_PASSES! Exponent not of the required form (odd prime or odd composite == any_of[960 possible values] (mod 4620).\n"); + ASSERT(i == TF_PASSES, "CHECK_PKMOD4620 returns something other than the expected #TF_PASSES! 
Exponent not of the required form (odd prime or odd composite == any_of[960 possible values] (mod 4620).\n"); #endif /* If it's a restart, interval_lo for the initial pass will be based @@ -1893,7 +1893,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: #ifdef FAC_DEBUG /* Make sure the range of k's for the run contains any target factor: */ if(k_targ) - ASSERT(HERE, (kmin <= k_targ) && (kmax >= k_targ),"k_targ not in [kmin, kmax]"); + ASSERT((kmin <= k_targ) && (kmax >= k_targ),"k_targ not in [kmin, kmax]"); #endif #ifdef FACTOR_STANDALONE @@ -2018,7 +2018,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: } /* Should never reach this regular-loop-exit point: */ fprintf(stderr,"ERROR: failed to find a multiple of prime %u\n", curr_p); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); KLOOP: /* Propagate copies of length (regs_todo) bit-cleared portion of sieve to remaining parts of sieve. @@ -2054,7 +2054,7 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: #ifdef FACTOR_STANDALONE printf( "TRYQ = %u, max sieving prime = %u\n",TRYQ,MAX_SIEVING_PRIME); #else - ASSERT(HERE, fp == 0x0,"0"); + ASSERT(fp == 0x0,"0"); fp = mlucas_fopen(STATFILE,"a"); fprintf(fp,"TRYQ = %u, max sieving prime = %u\n",TRYQ,MAX_SIEVING_PRIME); fclose(fp); fp = 0x0; @@ -2121,11 +2121,11 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: } /* end of K loop */ //printf("L3: template word %u [used %u copies] bit_atlas chart %u, word %u, bit %u\n",(uint32)k,ncopies,l,word,bit); exit(0); // For 60|4620 classes expect to end at bit 15|63 of the last word of each of the TF_PASSES = 16|960 sievelets (a.k.a. 
charts in our atlas): - ASSERT(HERE, (k == 0) && (l == 0), "bit_atlas init: Exit check 1 failed!"); + ASSERT((k == 0) && (l == 0), "bit_atlas init: Exit check 1 failed!"); #if TF_CLASSES == 60 - ASSERT(HERE, (word == 4254) && (bit == 15), "bit_atlas init: Exit check 2 failed!"); + ASSERT((word == 4254) && (bit == 15), "bit_atlas init: Exit check 2 failed!"); #else // 4620 classes: - ASSERT(HERE, (word == 3535) && (bit == 63), "bit_atlas init: Exit check 2 failed!"); + ASSERT((word == 3535) && (bit == 63), "bit_atlas init: Exit check 2 failed!"); #endif #ifdef FAC_DEBUG @@ -2157,27 +2157,27 @@ Fermat Fn (n > 0): 0,Acceptable km-values for the ? possible pm (= p%60) values: switch(pmodNC) { /* p mod 12 = 1: */ - case 1:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case 1"); break; /* k mod 5 .ne. 2 */ - case 37:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==48&&incr[i++]==59&&incr[i++]==60, "factor.c : case 37"); break; /* k mod 5 .ne. 1 */ - case 13:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==47&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==60, "factor.c : case 13"); break; /* k mod 5 .ne. 4 */ - case 49:ASSERT(HERE, incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case 49"); break; /* k mod 5 .ne. 
3 */ + case 1:ASSERT(incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case 1"); break; /* k mod 5 .ne. 2 */ + case 37:ASSERT(incr[i++]== 3&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==48&&incr[i++]==59&&incr[i++]==60, "factor.c : case 37"); break; /* k mod 5 .ne. 1 */ + case 13:ASSERT(incr[i++]== 3&&incr[i++]== 8&&incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==23&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==47&&incr[i++]==48&&incr[i++]==51&&incr[i++]==56&&incr[i++]==60, "factor.c : case 13"); break; /* k mod 5 .ne. 4 */ + case 49:ASSERT(incr[i++]==11&&incr[i++]==12&&incr[i++]==15&&incr[i++]==20&&incr[i++]==24&&incr[i++]==27&&incr[i++]==32&&incr[i++]==35&&incr[i++]==36&&incr[i++]==39&&incr[i++]==44&&incr[i++]==47&&incr[i++]==51&&incr[i++]==56&&incr[i++]==59&&incr[i++]==60, "factor.c : case 49"); break; /* k mod 5 .ne. 3 */ /* p mod 12 == 7: */ - case 31:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==60, "factor.c : case 31"); break; /* k mod 5 .ne. 2 */ - case 7:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==33&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==57&&incr[i++]==60, "factor.c : case 7"); break; /* k mod 5 .ne. 
1 */ - case 43:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==32&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 43"); break; /* k mod 5 .ne. 4 */ - case 19:ASSERT(HERE, incr[i++]== 5&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 19"); break; /* k mod 5 .ne. 3 */ + case 31:ASSERT(incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==60, "factor.c : case 31"); break; /* k mod 5 .ne. 2 */ + case 7:ASSERT(incr[i++]== 5&&incr[i++]== 8&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==33&&incr[i++]==44&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==57&&incr[i++]==60, "factor.c : case 7"); break; /* k mod 5 .ne. 1 */ + case 43:ASSERT(incr[i++]== 5&&incr[i++]== 8&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==32&&incr[i++]==33&&incr[i++]==36&&incr[i++]==41&&incr[i++]==45&&incr[i++]==48&&incr[i++]==53&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 43"); break; /* k mod 5 .ne. 4 */ + case 19:ASSERT(incr[i++]== 5&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==17&&incr[i++]==20&&incr[i++]==21&&incr[i++]==24&&incr[i++]==29&&incr[i++]==32&&incr[i++]==36&&incr[i++]==41&&incr[i++]==44&&incr[i++]==45&&incr[i++]==56&&incr[i++]==57&&incr[i++]==60, "factor.c : case 19"); break; /* k mod 5 .ne. 
3 */ /* p mod 12 == 5: */ - case 41:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 4&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==55&&incr[i++]==60, "factor.c : case 41"); break; /* k mod 5 .ne. 2 */ - case 17:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==28&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 17"); break; /* k mod 5 .ne. 1 */ - case 53:ASSERT(HERE, incr[i++]== 3&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==27&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 53"); break; /* k mod 5 .ne. 4 */ - case 29:ASSERT(HERE, incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 29"); break; /* k mod 5 .ne. 3 */ + case 41:ASSERT(incr[i++]== 3&&incr[i++]== 4&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==55&&incr[i++]==60, "factor.c : case 41"); break; /* k mod 5 .ne. 2 */ + case 17:ASSERT(incr[i++]== 3&&incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==28&&incr[i++]==39&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 17"); break; /* k mod 5 .ne. 
1 */ + case 53:ASSERT(incr[i++]== 3&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==27&&incr[i++]==28&&incr[i++]==31&&incr[i++]==36&&incr[i++]==40&&incr[i++]==43&&incr[i++]==48&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 53"); break; /* k mod 5 .ne. 4 */ + case 29:ASSERT(incr[i++]== 4&&incr[i++]== 7&&incr[i++]==12&&incr[i++]==15&&incr[i++]==16&&incr[i++]==19&&incr[i++]==24&&incr[i++]==27&&incr[i++]==31&&incr[i++]==36&&incr[i++]==39&&incr[i++]==40&&incr[i++]==51&&incr[i++]==52&&incr[i++]==55&&incr[i++]==60, "factor.c : case 29"); break; /* k mod 5 .ne. 3 */ /* p mod 12 == 11: */ - case 11:ASSERT(HERE, incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==60, "factor.c : case 11"); break; /* k mod 5 .ne. 2 */ - case 47:ASSERT(HERE, incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==13&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 47"); break; /* k mod 5 .ne. 1 */ - case 23:ASSERT(HERE, incr[i++]== 1&&incr[i++]==12&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 23"); break; /* k mod 5 .ne. 4 */ - case 59:ASSERT(HERE, incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 59"); break; /* k mod 5 .ne. 
3 */ + case 11:ASSERT(incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==60, "factor.c : case 11"); break; /* k mod 5 .ne. 2 */ + case 47:ASSERT(incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==13&&incr[i++]==24&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 47"); break; /* k mod 5 .ne. 1 */ + case 23:ASSERT(incr[i++]== 1&&incr[i++]==12&&incr[i++]==13&&incr[i++]==16&&incr[i++]==21&&incr[i++]==25&&incr[i++]==28&&incr[i++]==33&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==48&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 23"); break; /* k mod 5 .ne. 4 */ + case 59:ASSERT(incr[i++]== 1&&incr[i++]== 4&&incr[i++]== 9&&incr[i++]==12&&incr[i++]==16&&incr[i++]==21&&incr[i++]==24&&incr[i++]==25&&incr[i++]==36&&incr[i++]==37&&incr[i++]==40&&incr[i++]==45&&incr[i++]==49&&incr[i++]==52&&incr[i++]==57&&incr[i++]==60, "factor.c : case 59"); break; /* k mod 5 .ne. 3 */ default: - ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT,"Only Mersenne and fermat-number factoring supported!"); + ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT,"Only Mersenne and fermat-number factoring supported!"); } #endif @@ -2266,10 +2266,10 @@ candidate factors that survive sieving. 
*/ /* Starting no.-of-times-through-sieve = kmin/(64*len) : */ if(pass == passnow && (know > kmin)) { interval_lo = know/((uint64)len << TF_CLSHIFT); - ASSERT(HERE, know == interval_lo *(len << TF_CLSHIFT),"know == interval_lo*(len << TF_CLSHIFT)"); + ASSERT(know == interval_lo *(len << TF_CLSHIFT),"know == interval_lo*(len << TF_CLSHIFT)"); } else { interval_lo = kmin/((uint64)len << TF_CLSHIFT); - ASSERT(HERE, kmin == interval_lo *(len << TF_CLSHIFT),"kmin == interval_lo*(len << TF_CLSHIFT)"); + ASSERT(kmin == interval_lo *(len << TF_CLSHIFT),"kmin == interval_lo*(len << TF_CLSHIFT)"); } } else { interval_lo = interval_hi; // This is what defines a 'no-op' pool task. @@ -2277,9 +2277,9 @@ candidate factors that survive sieving. */ /* Set initial k for this pass to default value (= incr[pass]) + interval_lo*(64*len), (assume this could be as large as 64 bits), then use it to set initial q for this pass: */ - ASSERT(HERE, (double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT"); + ASSERT((double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT"); k = (uint64)incr[pass] + interval_lo*(len << TF_CLSHIFT); - // fprintf(stderr," [*** Init pass %u data: k0 = %llu, word0 = %16llX\n",pass,k,bit_map[0]); + // fprintf(stderr," [*** Init pass %u data: k0 = %" PRIu64 ", word0 = %16" PRIX64 "\n",pass,k,bit_map[0]); struct fac_thread_data_t* targ = tdat + thr_id; targ->count = &count; targ->tid = thr_id; // Within the per-thread TFing, only the pool-thread ID matters @@ -2344,14 +2344,14 @@ candidate factors that survive sieving. 
*/ #if 0 printf("adding pool task %d with pool ID [%d]\n",thr_id,((struct thread_init *)(&task_control)->data)->thread_num); struct fac_thread_data_t* targ = tdat + thr_id; - printf("This task has: pass %u, interval_[lo,hi] = [%llu,%llu]\n",targ->pass,targ->interval_lo,targ->interval_hi); + printf("This task has: pass %u, interval_[lo,hi] = [%" PRIu64 ",%" PRIu64 "]\n",targ->pass,targ->interval_lo,targ->interval_hi); printf("; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); #endif } while(tpool->free_tasks_queue.num_tasks != NTHREADS) { // Posix sleep() too granular here; use finer-resolution, declared in ; cf. http://linux.die.net/man/2/nanosleep - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } fprintf(stderr,"\n"); // For pretty-printing, have the inline-pass-printing reflect || work, newlines reflect sync-points }; // wave-loop @@ -2374,7 +2374,7 @@ candidate factors that survive sieving. */ /* If debugging sieve, make sure critical bit hasn't been cleared: */ if( k_targ && (((bit_map[i64_targ] >> bit_targ) & 1) == 0) ) { fprintf(stderr,"Critical bit cleared in master bitmap!\n"); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #endif @@ -2383,7 +2383,7 @@ candidate factors that survive sieving. */ printf("pass = %u",pass); fflush(stdout); } #else - ASSERT(HERE, fp == 0x0,"0"); + ASSERT(fp == 0x0,"0"); fp = mlucas_fopen(STATFILE,"a"); fprintf(fp,"Starting Trial-factoring Pass %2u...\n",pass); fclose(fp); fp = 0x0; @@ -2392,16 +2392,16 @@ candidate factors that survive sieving. 
*/ /* Starting no.-of-times-through-sieve = kmin/(64*len) : */ if(pass == passnow && (know > kmin)) { interval_lo = know/((uint64)len << TF_CLSHIFT); - ASSERT(HERE, know == interval_lo*((uint64)len << TF_CLSHIFT),"know == interval_lo*((uint64)len << TF_CLSHIFT)"); + ASSERT(know == interval_lo*((uint64)len << TF_CLSHIFT),"know == interval_lo*((uint64)len << TF_CLSHIFT)"); } else { interval_lo = kmin/((uint64)len << TF_CLSHIFT); - ASSERT(HERE, kmin == interval_lo*((uint64)len << TF_CLSHIFT),"kmin == interval_lo*((uint64)len << TF_CLSHIFT)"); + ASSERT(kmin == interval_lo*((uint64)len << TF_CLSHIFT),"kmin == interval_lo*((uint64)len << TF_CLSHIFT)"); } /* Set initial k for this pass to default value (= incr[pass]) + interval_lo*(64*len), (assume this could be as large as 64 bits), then use it to set initial q for this pass: */ - ASSERT(HERE, (double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT"); + ASSERT((double)interval_lo*(len << TF_CLSHIFT) < TWO64FLOAT, "(double)interval_lo*len < TWO64FLOAT"); k = (uint64)incr[pass] + interval_lo*(len << TF_CLSHIFT); i = nprime; // Remember, MAX_SIEVING_PRIME is a *variable* and set at runtime, as opposed to the predef NUM_SIEVING_PRIME; @@ -2439,7 +2439,7 @@ candidate factors that survive sieving. */ if(cudaError != cudaSuccess) { printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError)); - ASSERT(HERE, 0, "factor.c : GPU-side error detected!"); + ASSERT(0, "factor.c : GPU-side error detected!"); } #endif @@ -2465,14 +2465,14 @@ candidate factors that survive sieving. 
*/ #ifdef FACTOR_STANDALONE if(!restart) { - printf( "%s(%s) has %u factors in range k = [%llu, %llu], passes %u-%u\n", + printf( "%s(%s) has %u factors in range k = [%" PRIu64 ", %" PRIu64 "], passes %u-%u\n", NUM_PREFIX[MODULUS_TYPE], pstring, nfactor, kmin, kmax, passmin, passmax); printf( "Performed %s trial divides\n", &char_buf0[convert_uint64_base10_char(char_buf0, count)]); /* Since we're done accumulating cycle count, divide to get total time in seconds: */ printf( "Clocks =%s\n",get_time_str(tdiff)); } #else - ASSERT(HERE, fp == 0x0,"0"); + ASSERT(fp == 0x0,"0"); fp = mlucas_fopen(STATFILE,"a"); fprintf(fp,"Performed %s trial divides\n", &char_buf0[convert_uint64_base10_char(char_buf0, count)]); /* Since we're done accumulating cycle count, divide to get total time in seconds: */ @@ -2482,9 +2482,9 @@ candidate factors that survive sieving. */ fp = mlucas_fopen( OFILE,"a"); #ifdef P1WORD - fprintf(fp,"M(%s) has %u factors in range k = [%llu, %llu], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax); + fprintf(fp,"M(%s) has %u factors in range k = [%" PRIu64 ", %" PRIu64 "], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax); #else - fprintf(fp,"M(%s) has %u factors in range k = [%llu, %llu], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax); + fprintf(fp,"M(%s) has %u factors in range k = [%" PRIu64 ", %" PRIu64 "], passes %u-%u\n", pstring, nfactor, kmin, kmax, passmin, passmax); #endif fclose(fp); fp = 0x0; @@ -2492,7 +2492,7 @@ candidate factors that survive sieving. */ /* If a test factor was given, make sure we found at least one factor: */ if(k_targ > 0) { - ASSERT(HERE, nfactor > 0,"k_targ > 0 but failed to find at least one factor"); + ASSERT(nfactor > 0,"k_targ > 0 but failed to find at least one factor"); } #endif @@ -2566,7 +2566,7 @@ candidate factors that survive sieving. 
*/ #endif /* If we reached here other than via explicit invocation of the help menu, assert: */ if(!STREQ(stFlag, "-h")) - ASSERT(HERE, 0,"Mfactor: Unrecognized command-line option!"); + ASSERT(0,"Mfactor: Unrecognized command-line option!"); return(0); #endif } @@ -2686,7 +2686,7 @@ candidate factors that survive sieving. */ #ifdef P4WORD uint256 p256,q256,t256; #endif - char cbuf[STR_MAX_LEN], cbuf2[STR_MAX_LEN]; + char cbuf[STR_MAX_LEN*2], cbuf2[STR_MAX_LEN*2]; #ifdef CTIME clock_t clock1, clock2; #else // Multithreaded needs wall-clock, not CPU time: @@ -2709,27 +2709,27 @@ candidate factors that survive sieving. */ itmp = fscanf(fp,"%s\n",cstr); if(itmp <= 0 || !STREQ(cstr,pstring)) { sprintf(char_buf0,"Line 1 entry found in factoring savefile [%s] does not match exponent of run [%s].",cstr,pstring); - ASSERT(HERE,0,char_buf0); + ASSERT(0,char_buf0); } itmp = fscanf(fp,"%u\n",&i ); if(itmp <= 0 || i != TF_PASSES ) { sprintf(char_buf0,"Line 1 entry found in factoring savefile [%d] does not match exponent of run [%d].",i,TF_PASSES); - ASSERT(HERE,0,char_buf0); + ASSERT(0,char_buf0); } // See if restart file has a pass/max-k-reached entry matching the current pass: while(fgets(cstr,STR_MAX_LEN,fp)) { if((char_addr = strstr(cstr,"Pass ")) != 0) { itmp = sscanf(char_addr,"%u",i); if(itmp <= 0) { - fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(0,"0"); } if(i == pass) { // Is the pass index the one we are updating? 
If yes, update the k-value - ASSERT(HERE, !found_pass, "Multiple current-pass entry found in savefile!"); + ASSERT(!found_pass, "Multiple current-pass entry found in savefile!"); found_pass = TRUE; // Read the max-k-reached value - ASSERT(HERE,((char_addr = strstr(cstr,"Pass ")) != 0),"Expected : following pass number not found!"); - itmp = sscanf(char_addr,"%llu",k); - ASSERT(HERE,itmp >= 0,"Unable to read max-k-reached value!"); + ASSERT(((char_addr = strstr(cstr,"Pass ")) != 0),"Expected : following pass number not found!"); + itmp = sscanf(char_addr,"%" PRIu64,k); + ASSERT(itmp >= 0,"Unable to read max-k-reached value!"); // Even if valid entry found, process rest of file to ensure no duplicate-pass-number entries } } @@ -2737,7 +2737,7 @@ candidate factors that survive sieving. */ /* pstring*/ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (current exponent) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (current exponent) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } /* Strip the expected newline char from in_line: */ char_addr = strstr(in_line, "\n"); @@ -2745,23 +2745,23 @@ candidate factors that survive sieving. */ *char_addr = '\0'; /* Make sure restart-file and current-run pstring match: */ if(STRNEQ(in_line, pstring)) { - fprintf(stderr,"ERROR: current exponent %s != Line %d of factoring restart file %s!\n",pstring, curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: current exponent %s != Line %d of factoring restart file %s!\n",pstring, curr_line, RESTARTFILE); ASSERT(0,"0"); } /* bmin */ ++curr_line; - fgets(cbuf, STR_MAX_LEN, fp); + fgets(cbuf, STR_MAX_LEN*2, fp); itmp = sscanf(cbuf, "%lf", &bmin_file); if(itmp != 1) { - fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. 
Offending input = %s\n", curr_line, RESTARTFILE, cbuf); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf); ASSERT(0,"0"); } /* bmax */ ++curr_line; - fgets(cbuf, STR_MAX_LEN, fp); + fgets(cbuf, STR_MAX_LEN*2, fp); itmp = sscanf(cbuf, "%lf", &bmax_file); if(itmp != 1) { - fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to parse Line %d (bmin) of factoring restart file %s. Offending input = %s\n", curr_line, RESTARTFILE, cbuf); ASSERT(0,"0"); } /************************************ @@ -2776,7 +2776,7 @@ candidate factors that survive sieving. */ GET_LINE4: /**** redo this ****/ if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: 'KMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: 'KMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "KMin"); /* Since the preceding fscanf call may leave us at the end of curr_line-1 @@ -2787,7 +2787,7 @@ candidate factors that survive sieving. */ } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; kmin_file = convert_base10_char_uint64(char_addr); @@ -2796,15 +2796,15 @@ candidate factors that survive sieving. 
*/ /* KNow */ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (KNow) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (KNow) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "KNow"); if(!char_addr) { - fprintf(stderr,"ERROR: 'KNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: 'KNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; know_file = convert_base10_char_uint64(char_addr); @@ -2813,15 +2813,15 @@ candidate factors that survive sieving. 
*/ /* KMax */ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (KMax) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (KMax) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "KMax"); if(!char_addr) { - fprintf(stderr,"ERROR: 'KMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: 'KMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; kmax_file = convert_base10_char_uint64(char_addr); @@ -2830,71 +2830,71 @@ candidate factors that survive sieving. 
*/ /* PassMin */ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (PassMin) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (PassMin) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "PassMin"); if(!char_addr) { - fprintf(stderr,"ERROR: 'PassMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: 'PassMin' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; passmin_file = (uint32)convert_base10_char_uint64(char_addr); - ASSERT(HERE, passmin_file < TF_PASSES,"factor.c: passmin < TF_PASSES"); + ASSERT(passmin_file < TF_PASSES,"factor.c: passmin < TF_PASSES"); } /* PassNow */ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (PassNow) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (PassNow) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "PassNow"); if(!char_addr) { - fprintf(stderr,"ERROR: 'PassNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: 'PassNow' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s 
lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; passnow_file = (uint32)convert_base10_char_uint64(char_addr); - ASSERT(HERE, passnow_file < TF_PASSES,"factor.c: passnow < TF_PASSES"); - ASSERT(HERE, passnow_file >= passmin_file ,"factor.c: passnow_file >= passmin_file"); + ASSERT(passnow_file < TF_PASSES,"factor.c: passnow < TF_PASSES"); + ASSERT(passnow_file >= passmin_file ,"factor.c: passnow_file >= passmin_file"); } /* PassMax */ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (PassMax) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (PassMax) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "PassMax"); if(!char_addr) { - fprintf(stderr,"ERROR: 'PassMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: 'PassMax' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; passmax_file = (uint32)convert_base10_char_uint64(char_addr); - ASSERT(HERE, passmax_file < TF_PASSES,"factor.c: passmax_file < TF_PASSES"); - ASSERT(HERE, passmax_file >= passnow_file ,"factor.c: passmax_file >= passnow_file"); + ASSERT(passmax_file < TF_PASSES,"factor.c: passmax_file < TF_PASSES"); + ASSERT(passmax_file >= passnow_file ,"factor.c: passmax_file >= passnow_file"); } /* 
Number of q's tried: */ ++curr_line; if(!fgets(in_line, STR_MAX_LEN, fp)) { - fprintf(stderr,"ERROR: unable to read Line %d (#Q tried) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read Line %d (#Q tried) of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr = strstr(in_line, "#Q tried"); if(!char_addr) { - fprintf(stderr,"ERROR: '#Q tried' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: '#Q tried' not found in Line %d of factoring restart file %s!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } else { char_addr = strstr(in_line, "="); if(!char_addr) { - fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: Line %d of factoring restart file %s lacks the required = sign!\n", curr_line, RESTARTFILE); ASSERT(0,"0"); } char_addr++; count = convert_base10_char_uint64(char_addr); // Need to reset == 0 prior to sieving so kvector-fill code works properly @@ -2939,12 +2939,12 @@ candidate factors that survive sieving. */ if(bmin || bmax) { #if(!defined(P1WORD)) - // ASSERT(HERE, 0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!"); + // ASSERT(0,"bmin/bmax form of bounds-setting only allowed for single-word-p case!"); #endif - ASSERT(HERE, (kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run."); + ASSERT((kmin==0 && kmax==0 && kplus==0),"(kmin==0 && kmax==0 && kplus==0) - please delete any restart files for this p and retry debug run."); if(bmin) { - ASSERT(HERE, bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file"); + ASSERT(bmin >= bmin_file - 0.0000000001,"bmin >= bmin_file"); if(bmin < bmax_file) fprintf(stderr,"WARNING: Specified bmin (%lf) smaller than previous-run bmax = %lf. 
Setting equal to avoid overlapping runs.\n", bmin, bmax_file); } @@ -2952,7 +2952,7 @@ candidate factors that survive sieving. */ /* We expect any command-line bmax will be > that in the restart file: */ if(bmax) - ASSERT(HERE, bmax > bmax_file - 0.0000000001,"bmax >= bmax_file"); + ASSERT(bmax > bmax_file - 0.0000000001,"bmax >= bmax_file"); } /**** @@ -2963,10 +2963,10 @@ candidate factors that survive sieving. */ ****/ if(kmin || kmax) { - ASSERT(HERE, (bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)"); + ASSERT((bmin==0 && bmax==0 && kplus==0),"(bmin==0 && bmax==0 && kplus==0)"); if(kmin) { - ASSERT(HERE, kmin >= kmin_file,"kmin >= kmin_file"); + ASSERT(kmin >= kmin_file,"kmin >= kmin_file"); if(kmin < kmax_file) fprintf(stderr,"WARNING: Specified kmin (%s) smaller than previous-run kmax = %s. Setting equal to avoid overlapping runs.\n", &char_buf0[convert_uint64_base10_char(char_buf0, kmax)], &char_buf1[convert_uint64_base10_char(char_buf1, kmax_file)]); } @@ -2974,7 +2974,7 @@ candidate factors that survive sieving. */ /* We expect any command-line kmax will be > that in the restart file: */ if(kmax) - ASSERT(HERE, kmax > kmax_file,"kmax >= kmax_file"); + ASSERT(kmax > kmax_file,"kmax >= kmax_file"); } /**** @@ -2982,11 +2982,11 @@ candidate factors that survive sieving. */ ****/ if(kplus) { - ASSERT(HERE, (bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)"); + ASSERT((bmin==0 && bmax==0 && kmin==0 && kmax==0),"(bmin==0 && bmax==0 && kmin==0 && kmax==0)"); kmin = kmax_file; /* Ensure incremented value kmax fits into a 64-bit unsigned int: */ - ASSERT(HERE, (kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!"); + ASSERT((kmin + kplus) > kplus, "kmax_file + kplus exceeds 2^64!"); kmax = kmin + kplus; kplus = 0; /* If kplus != 0 detected further on, that indicates that no valid restart file was found for factoring-bounds incrementing. */ @@ -3016,12 +3016,12 @@ candidate factors that survive sieving. 
*/ #ifdef FAC_DEBUG // compute qstart = 2.kstart.p + 1: - ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); + ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even printf(" Initial q for this pass = %s.\n", &char_buf0[convert_mi64_base10_char(char_buf0, q, lenQ, 0)]); #endif //if(pass==4) -// printf("\nPass %u: k0 = %u, word0 prior to deep-prime clearing = %16llX\n",pass,(uint32)kstart,bit_map[0]); +// printf("\nPass %u: k0 = %u, word0 prior to deep-prime clearing = %16" PRIX64 "\n",pass,(uint32)kstart,bit_map[0]); // Compute startbit k (occurrence of first multiple of prime curr_p in first pass through the relevant sievelet: if((lenP == 1) && (p[0] <= MAX_SIEVING_PRIME)) get_startval(MODULUS_TYPE, p[0], findex, two_p, lenQ, bit_len, interval_lo, incr, nclear, nprime, p_last_small, pdiff, startval); @@ -3033,7 +3033,7 @@ candidate factors that survive sieving. */ #ifdef MULTITHREAD //if(tid == 0) #endif -// printf("sweep %llu: k0 = %llu, count %llu: k0-3 = %llu,%llu,%llu,%llu\n",sweep,kstart,count,k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]); +// printf("sweep %" PRIu64 ": k0 = %" PRIu64 ", count %" PRIu64 ": k0-3 = %" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",sweep,kstart,count,k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]); /* Accumulate the cycle count every so often to avoid problems with integer overflow of the clock() result, if clock_t happens to be a 32-bit int type on the host platform: @@ -3068,7 +3068,7 @@ candidate factors that survive sieving. 
*/ fbits_in_k = log((double)k + TF_CLASSES*bit_len)*ILG2; // Use k-value at end of upcoming pass thru sieve as upper-bound fbits_in_q = fbits_in_2p + fbits_in_k; // if(fbits_in_q > 64) - // printf("sweep = %llu: fbits_in_q = fbits_in_2p [%10.4f] + fbits_in_k [%10.4f] = %10.4f\n",sweep,fbits_in_2p,fbits_in_k,fbits_in_q); + // printf("sweep = %" PRIu64 ": fbits_in_q = fbits_in_2p [%10.4f] + fbits_in_k [%10.4f] = %10.4f\n",sweep,fbits_in_2p,fbits_in_k,fbits_in_q); /*********************************************/ #if DBG_SIEVE @@ -3083,7 +3083,7 @@ candidate factors that survive sieving. */ if((k <= k_targ) && (k_targ < (k+(sieve_len<<6)))) { itmp64 = k_targ - k; - ASSERT(HERE, itmp64%TF_CLASSES == 0,"(k_targ - k)%TF_CLASSES == 0"); + ASSERT(itmp64%TF_CLASSES == 0,"(k_targ - k)%TF_CLASSES == 0"); itmp64 /= TF_CLASSES; i64_targ = itmp64 >> 6; bit_targ = itmp64 & 63; @@ -3139,7 +3139,7 @@ candidate factors that survive sieving. */ curr_p += (pdiff[m] << 1); if(curr_p > bit_len && !((nprime - m)&63)) { // 2nd clause is to make Loop #2 count a multiple of 64 curr_p -= (pdiff[m] << 1); - ASSERT(HERE, curr_p < p[0],"On Loop 1 exit: curr_p >= p!"); + ASSERT(curr_p < p[0],"On Loop 1 exit: curr_p >= p!"); break; } l = startval[m]; @@ -3225,15 +3225,15 @@ candidate factors that survive sieving. 
*/ #endif #if DBG_SIEVE if(k_targ && l == (i64_targ*64 + bit_targ)) { - fprintf(stderr,"Critical bit being cleared by prime %u, with offset %u\n", curr_p, startval[m]); ASSERT(HERE, 0,"0"); + fprintf(stderr,"Critical bit being cleared by prime %u, with offset %u\n", curr_p, startval[m]); ASSERT(0,"0"); } #endif l += curr_p; } /*...save new startvalue: */ #if DBG_SIEVE - ASSERT(HERE, (startval[m] + startval_incr[m]) < (curr_p + curr_p), "factor.c : (startval[m] + startval_incr[m]) < (curr_p + curr_p)"); - ASSERT(HERE, l-bit_len == (startval[m] + startval_incr[m])%curr_p, "factor.c : l-bit_len == (startval[m] + startval_incr[m])%curr_p"); + ASSERT((startval[m] + startval_incr[m]) < (curr_p + curr_p), "factor.c : (startval[m] + startval_incr[m]) < (curr_p + curr_p)"); + ASSERT(l-bit_len == (startval[m] + startval_incr[m])%curr_p, "factor.c : l-bit_len == (startval[m] + startval_incr[m])%curr_p"); #endif startval[m] = l-bit_len; } @@ -3241,7 +3241,7 @@ candidate factors that survive sieving. */ #endif // USE_AVX512 ? -// if(pass==4)printf("\nPass %u: word0 after deep-prime clearing = %16llX\n",pass,bit_map2[0]); +// if(pass==4)printf("\nPass %u: word0 after deep-prime clearing = %16" PRIX64 "\n",pass,bit_map2[0]); // Now run through the bits of the current copy of the sieve, trial dividing if a bit = 1: #if TF_CLASSES == 60 @@ -3249,7 +3249,7 @@ candidate factors that survive sieving. */ #else ihi = (sieve_len*64)/TF_CLASSES; // 64*sieve_len divisible by TF_CLASSES, no need for padding #endif - ASSERT(HERE, ihi == ((bit_len+63)>>6), "Ihi value-check failed!"); + ASSERT(ihi == ((bit_len+63)>>6), "Ihi value-check failed!"); #ifdef FAC_DEBUG m = 0; // accum popc for(i = 0; i < ihi; i++) { @@ -3262,7 +3262,7 @@ candidate factors that survive sieving. 
*/ #ifdef MULTITHREAD if(tid == 0) #endif - printf("%u [%6.2f%%] survived; count = %llu\n",m,100.*(float)m/bit_len,count); + printf("%u [%6.2f%%] survived; count = %" PRIu64 "\n",m,100.*(float)m/bit_len,count); #endif bit_hi = 64; @@ -3276,13 +3276,13 @@ candidate factors that survive sieving. */ { #ifdef FAC_DEBUG /* If a known factor is specified, here it is in the bitmap: */ - if(ABS((int64)(k-k_targ)) < 1000) printf("Trying k = %llu\n",k); + if(ABS((int64)(k-k_targ)) < 1000) printf("Trying k = %" PRIu64 "\n",k); if(k == k_targ) { printf("here it is: sweep = %s, bitmap word = %u, bit = %3u\n", &cbuf[convert_uint64_base10_char(cbuf, sweep)], i, bit); if((bit_map2[i] >> bit) & 1) - printf("Trying k_targ = %llu...\n", k_targ); + printf("Trying k_targ = %" PRIu64 "...\n", k_targ); else - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #endif @@ -3301,16 +3301,16 @@ candidate factors that survive sieving. */ */ if((count & countmask) == 0) { - fprintf(stderr,"[k = %llu]",k); + fprintf(stderr,"[k = %" PRIu64 "]",k); #ifdef MULTITHREAD pthread_mutex_lock(&mutex_mi64); // printf("Count = %u * 2^%u checkpoint: Thread %u locked mutex_mi64 ... ",(uint32)(count >> CMASKBITS),CMASKBITS,tid); #endif fp = mlucas_fopen(OFILE,"a"); - ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); + ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even #ifdef FAC_DEBUG - sprintf(cbuf, " Count = %u * 2^%u: k = %llu, Current q = %s\n", + sprintf(cbuf, " Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s\n", (uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]); fprintf(stderr, "%s", cbuf); #endif @@ -3323,9 +3323,9 @@ candidate factors that survive sieving. 
*/ if(MODULUS_TYPE == MODULUS_TYPE_MERSMERS) { res = (mi64_twopmodq_qmmp(findex, k, u64_arr) == 1); if(res != (mi64_twopmodq(p, lenP, k, q, lenQ, q2) == 1) || q2[0] != u64_arr[0]) { - sprintf(cbuf, "ERROR: Spot-check k = %llu, Results of mi64_twopmodq_qmmp and mi64_twopmodq differ!\n", k); + sprintf(cbuf, "ERROR: Spot-check k = %" PRIu64 ", Results of mi64_twopmodq_qmmp and mi64_twopmodq differ!\n", k); fprintf(fp,"%s", cbuf); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } } @@ -3333,18 +3333,18 @@ candidate factors that survive sieving. */ mi64_clear(u64_arr, lenQ); // Use q2 for quotient [i.e. factor-candidate k] and u64_arr for remainder mi64_div(q,two_p,lenQ,lenQ,q2,u64_arr); if(mi64_getlen(q2, lenQ) != 1) { - sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %llu, Current q = %s: k must be 64-bit!\n", + sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s: k must be 64-bit!\n", (uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]); fprintf(fp,"%s", cbuf); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } if(!mi64_cmp_eq_scalar(u64_arr, 1ull, lenQ)) { - sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %llu, Current q = %s: q mod (2p) = %s != 1!\n", + sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s: q mod (2p) = %s != 1!\n", (uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)], &cbuf2[convert_mi64_base10_char(cbuf2, u64_arr, lenQ, 0)]); fprintf(fp,"%s", cbuf); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } /* If q is composite [only check this in debug mode since it costs more than checking @@ -3354,7 +3354,7 @@ candidate factors that survive sieving. 
*/ mi64_sub_scalar(q2,1ull,q2,lenQ); // Re-use q2 to store q-1 if(mi64_twopmodq(q2, lenQ, 0, q, lenQ, 0x0) != 1) { #if SPOT_CHECK - printf(" INFO: Spot-check q with k = %llu is composite\n",k); + printf(" INFO: Spot-check q with k = %" PRIu64 " is composite\n",k); #endif l = 3; for(m = 0; m < nprime; m++) { @@ -3364,21 +3364,21 @@ candidate factors that survive sieving. */ if(mi64_is_div_by_scalar32((uint32 *)q, l, lenQ)) { #ifdef MULTITHREAD // if(tid != 0) break; // Can make thread-specific by fiddling the rhs of the != - printf("Thread %u, k = %llu: q = ",tid,k); - if(lenQ > 1)printf("2^64 * %llu + ",q[1]); - printf("%llu has a small divisor: %u\n",q[0], l); - ASSERT(HERE, 0, "Abort..."); + printf("Thread %u, k = %" PRIu64 ": q = ",tid,k); + if(lenQ > 1)printf("2^64 * %" PRIu64 " + ",q[1]); + printf("%" PRIu64 " has a small divisor: %u\n",q[0], l); + ASSERT(0, "Abort..."); #else - sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %llu, Current q = %s has a small divisor: %u\n", + sprintf(cbuf, "ERROR: Count = %u * 2^%u: k = %" PRIu64 ", Current q = %s has a small divisor: %u\n", (uint32)(count >> CMASKBITS),CMASKBITS,k,&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)],l); fprintf(fp,"%s", cbuf); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); #endif } } } else { #if SPOT_CHECK - printf(" INFO: Spot-check q with k = %llu is base-2 PRP\n",k); + printf(" INFO: Spot-check q with k = %" PRIu64 " is base-2 PRP\n",k); #endif } fclose(fp); fp = 0x0; @@ -3413,35 +3413,35 @@ candidate factors that survive sieving. 
*/ if(k < 1000) { printf("Do deep sieving for k = %u\n",(uint32)k); /****** Apr 2105: This all needs to be made thread-safe ******/ - ASSERT(HERE, 0, "This all needs to be made thread-safe!"); + ASSERT(0, "This all needs to be made thread-safe!"); kdeep[*ndeep++] = (uint32)k; - ASSERT(HERE, *ndeep < 1024, "Increase allocation of kdeep[] array or use deeper sieving bound to reduce #candidate k's!"); + ASSERT(*ndeep < 1024, "Increase allocation of kdeep[] array or use deeper sieving bound to reduce #candidate k's!"); // itmp64 = factor_qmmp_sieve64((uint32)findex, k, MAX_SIEVING_PRIME+2, 0x0001000000000000ull); // if(itmp64) { - // printf("Q( k = %u ) has a small factor: %20llu\n",(uint32)k, itmp64); + // printf("Q( k = %u ) has a small factor: %20" PRIu64 "\n",(uint32)k, itmp64); // } } res = 0; } else { - ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); + ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even res = (mi64_twopmodq_qmmp(findex, k, u64_arr) == 1); // Uncomment to debug by comparing the results of the slow and fast-MMp-optimized modmul routines /* if(res != (mi64_twopmodq(p, lenP, k, q, lenQ, q2) == 1) || q2[0] != u64_arr[0]) { - ASSERT(HERE, 0, "bzzt!"); + ASSERT(0, "bzzt!"); } */ } } else { - ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); + ASSERT(0 == mi64_mul_scalar(two_p,k,q,lenQ), "2.k.p overflows!"); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even res = mi64_twopmodq(p, lenP, k, q, lenQ, u64_arr); } #elif(defined(P4WORD)) - ASSERT(HERE, 0ull == mi64_mul_scalar(two_p,k,(uint64*)&q256,lenQ), "2.k.p overflows!"); + ASSERT(0ull == mi64_mul_scalar(two_p,k,(uint64*)&q256,lenQ), "2.k.p overflows!"); q256.d0 += 1; // No need to check for carry since 2.k.p even p256.d0 = p[0]; p256.d1 = p[1]; p256.d2 = p[2]; p256.d3 = p[3]; t256 = twopmodq256(p256,q256); @@ -3451,12 +3451,12 @@ candidate factors that 
survive sieving. */ #ifdef USE_FLOAT - ASSERT(HERE, !p[2], "twopmodq200: p[2] nonzero!"); + ASSERT(!p[2], "twopmodq200: p[2] nonzero!"); x256 = twopmodq200_8WORD_qmmp(p,k); res = (uint64)CMPEQ256(x256, ONE256); #else - ASSERT(HERE, 0ull == mi64_mul_scalar(two_p,k,(uint64*)&q192,lenQ), "2.k.p overflows!"); + ASSERT(0ull == mi64_mul_scalar(two_p,k,(uint64*)&q192,lenQ), "2.k.p overflows!"); q192.d0 += 1; // No need to check for carry since 2.k.p even p192.d0 = p[0]; p192.d1 = p[1]; p192.d2 = p[2]; t192 = twopmodq192(p192,q192); @@ -3505,7 +3505,7 @@ candidate factors that survive sieving. */ #endif else { - ASSERT(HERE, fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!"); + ASSERT(fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!"); #if USE_128x96 == 1 /* Use strictly 96-bit routines: */ res = twopmodq96 (p[0],k); @@ -3514,7 +3514,7 @@ candidate factors that survive sieving. */ res = twopmodq128_96(p[0],k); #else /* Use fully 128-bit routines: */ - // ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k,(uint64*)&q128,lenQ), "2.k.p overflows!"); + // ASSERT(0 == mi64_mul_scalar(two_p,k,(uint64*)&q128,lenQ), "2.k.p overflows!"); res = twopmodq128x2(p,k); #endif } @@ -3531,7 +3531,7 @@ candidate factors that survive sieving. */ #error TRYQ = 2 / P3WORD only allowed if USE_FLOAT is defined! #endif /* #ifdef USE_FMADD */ - ASSERT(HERE, !p[2], "twopmodq200: p[2] nonzero!"); + ASSERT(!p[2], "twopmodq200: p[2] nonzero!"); res = twopmodq200_8WORD_qmmp_x2_sse2(p,k_to_try[0],k_to_try[1]); #elif(defined(P1WORD)) @@ -3558,7 +3558,7 @@ candidate factors that survive sieving. */ #ifdef P3WORD - // ASSERT(HERE, !p[2], "twopmodq200: p[2] nonzero!"); + // ASSERT(!p[2], "twopmodq200: p[2] nonzero!"); res = twopmodq192_q4(p,k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]); #elif(defined(P2WORD)) @@ -3609,7 +3609,7 @@ candidate factors that survive sieving. 
*/ res = twopmodq72_q4(p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3]); #endif else { - ASSERT(HERE, fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!"); + ASSERT(fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!"); #if USE_128x96 == 1 /* Use strictly 96-bit routines: */ res = twopmodq96_q4 (p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3], 0,tid); @@ -3670,7 +3670,7 @@ candidate factors that survive sieving. */ res = twopmodq65_q8(p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3],k_to_try[4],k_to_try[5],k_to_try[6],k_to_try[7]); #endif else { - ASSERT(HERE, fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!"); + ASSERT(fbits_in_q < 96, "fbits_in_q exceeds allowable limit of 96!"); #if USE_128x96 == 1 /* Use strictly 96-bit routines: */ res = twopmodq96_q8 (p[0],k_to_try[0],k_to_try[1],k_to_try[2],k_to_try[3],k_to_try[4],k_to_try[5],k_to_try[6],k_to_try[7], 0,tid); @@ -3742,7 +3742,7 @@ candidate factors that survive sieving. */ q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even if(mi64_twopmodq(p, lenP, k_to_try[l], q, lenQ, q2) != 1) { - fprintf(stderr, "ERROR: k = %llu, post-check indicates this does not yield a factor.\n", k_to_try[l]); + fprintf(stderr, "ERROR: k = %" PRIu64 ", post-check indicates this does not yield a factor.\n", k_to_try[l]); // printf("Args sent to mi64_twopmodq:\n"); // printf("p = %s\n", &cbuf[convert_mi64_base10_char(cbuf, p, lenP, 0)]); // printf("q = %s\n", &cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]); @@ -3759,9 +3759,9 @@ candidate factors that survive sieving. */ if(mi64_pprimeF(q, 3ull, lenQ)) { factor_k[(*nfactor)++] = k_to_try[l]; if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) - sprintf(cbuf,"\n\tFactor found: q = %s = 2^(%u+2)*%llu. This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],findex,k_to_try[l]/2); + sprintf(cbuf,"\n\tFactor found: q = %s = 2^(%u+2)*%" PRIu64 ". 
This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],findex,k_to_try[l]/2); else - sprintf(cbuf,"\n\tFactor found: q = %s = 2*p*k + 1 with k = %llu. This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],k_to_try[l]); + sprintf(cbuf,"\n\tFactor found: q = %s = 2*p*k + 1 with k = %" PRIu64 ". This factor is a probable prime.\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)],k_to_try[l]); #ifdef FAC_DEBUG if(TRYQM1 > 1) printf("factor was number %u of 0-%u in current batch.\n", l, TRYQM1); @@ -3773,7 +3773,7 @@ candidate factors that survive sieving. */ printf("\n\tComposite Factor found: q = %s; checking if any previously-found ones divide it...\n",&cstr[convert_mi64_base10_char(cstr, q, lenQ, 0)]); for(j = 0; j < *nfactor; j++) { q2[lenP] = mi64_mul_scalar( p, 2*factor_k[j], q2, lenP); - ASSERT(HERE, lenP == 1 && q2[lenP] == 0ull, "Unexpected carryout in known-factor computation!"); + ASSERT(lenP == 1 && q2[lenP] == 0ull, "Unexpected carryout in known-factor computation!"); q2[0] += 1; // q2 = 2.k.p + 1; No need to check for carry since 2.k.p even mi64_clear(u64_arr, lenQ); // Use u64_arr for quotient; only care if remainder == 0 or not if(mi64_div(q,q2,lenQ,lenQ,u64_arr,0x0)) { @@ -3786,9 +3786,9 @@ candidate factors that survive sieving. 
*/ to get k2 from k and k1, use k2 = (k - k1)/f1: */ factor_k[*nfactor-1] = (factor_k[*nfactor-1] - factor_k[j])/q2[0]; if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) - sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2^(%u+2)*%llu.\n",findex,factor_k[j]); + sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2^(%u+2)*%" PRIu64 ".\n",findex,factor_k[j]); else - sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2*p*k + 1 with k = %llu.\n",factor_k[j]); + sprintf(cbuf,"\n\tFactor divisible by previously-found factor 2*p*k + 1 with k = %" PRIu64 ".\n",factor_k[j]); } mi64_set_eq(q, u64_arr, lenQ); } @@ -3801,11 +3801,11 @@ candidate factors that survive sieving. */ #ifdef FACTOR_STANDALONE fprintf(stderr,"%s", cbuf); #else - fp = mlucas_fopen(STATFILE,"a"); ASSERT(HERE, fp != 0x0,"0"); + fp = mlucas_fopen(STATFILE,"a"); ASSERT(fp != 0x0,"0"); fprintf(fp,"%s", cbuf); fclose(fp); fp = 0x0; #endif - fp = mlucas_fopen( OFILE,"a"); ASSERT(HERE, fp != 0x0,"0"); + fp = mlucas_fopen( OFILE,"a"); ASSERT(fp != 0x0,"0"); fprintf(fp,"%s", cbuf); fclose(fp); fp = 0x0; #ifdef QUIT_WHEN_FACTOR_FOUND @@ -3840,21 +3840,21 @@ candidate factors that survive sieving. 
*/ itmp64 = mi64_mul_scalar(two_p,k_to_try[l],q,lenQ); // Should only happen benignly, for q just above a wordcount boundary due to padding at high end of current sieve interval // if(itmp64) - // fprintf(stderr,"2.k.p overflows for k = %llu, result = %llu*2^64 + %llu\n",k_to_try[l],itmp64,q[0]); + // fprintf(stderr,"2.k.p overflows for k = %" PRIu64 ", result = %" PRIu64 "*2^64 + %" PRIu64 "\n",k_to_try[l],itmp64,q[0]); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even - // if(k_to_try[0] > 16300000 && k_to_try[0] < 16340000)printf("A: Trying k[%u] = %llu, q = %s\n",l,k_to_try[l],&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]); + // if(k_to_try[0] > 16300000 && k_to_try[0] < 16340000)printf("A: Trying k[%u] = %" PRIu64 ", q = %s\n",l,k_to_try[l],&cbuf[convert_mi64_base10_char(cbuf, q, lenQ, 0)]); #endif #ifdef P4WORD - ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!"); + ASSERT(0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!"); q[0] += 1; // No need to check for carry since 2.k.p even t256 = twopmodq256(*(uint256*)p,*(uint256*)q); res = CMPEQ256(t256, ONE256); #elif(defined(P3WORD)) - ASSERT(HERE, 0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!"); + ASSERT(0 == mi64_mul_scalar(two_p,k_to_try[l],q,lenQ), "2.k.p overflows!"); q[0] += 1; // No need to check for carry since 2.k.p even t192 = twopmodq192(*(uint192*)p,*(uint192*)q); res = CMPEQ192(t192, ONE192); @@ -3887,12 +3887,12 @@ candidate factors that survive sieving. */ #ifdef FACTOR_STANDALONE fprintf(stderr,"%s", cbuf); #else - fp = mlucas_fopen(STATFILE,"a"); ASSERT(HERE, fp != 0x0,"0"); + fp = mlucas_fopen(STATFILE,"a"); ASSERT(fp != 0x0,"0"); fprintf(fp,"%s", cbuf); fclose(fp); fp = 0x0; #endif - fp = mlucas_fopen( OFILE,"a"); ASSERT(HERE, fp != 0x0,"0"); + fp = mlucas_fopen( OFILE,"a"); ASSERT(fp != 0x0,"0"); fprintf(fp,"%s", cbuf); fclose(fp); fp = 0x0; @@ -3920,7 +3920,7 @@ candidate factors that survive sieving. 
*/ // Every 1024th pass, write the checkpoint file, with format as described previously: if(((sweep + 1) %(1024/lenQ + 1)) == 0 || ((sweep + 1) == interval_hi)) { i = write_savefile(RESTARTFILE, pstring, pass, k, count); // Only overwrite passnow, know and count fields of savefile - ASSERT(HERE,!i,"There were errors writing the savefile ... aborting"); + ASSERT(!i,"There were errors writing the savefile ... aborting"); } /* Successfully wrote restart file. */ #endif /* #if !FAC_DEBUG */ } @@ -3956,12 +3956,12 @@ candidate factors that survive sieving. */ if(!fp) { fprintf(stderr,"INFO: factoring savefile %s not found - will create.\n",RESTARTFILE); } else { // If file exists, it should have the proper first 2 lines: - itmp = fscanf(fp,"%s\n",cstr); if(itmp <= 0 || !STREQ(cstr,pstring)) ASSERT(HERE,0,"Line 1 entry found in factoring savefile does not match exponent of run."); - itmp = fscanf(fp,"%u\n",&i ); if(itmp <= 0 || i != TF_PASSES ) ASSERT(HERE,0,"Line 2 entry found in factoring savefile does not match TF_PASSES value of build."); + itmp = fscanf(fp,"%s\n",cstr); if(itmp <= 0 || !STREQ(cstr,pstring)) ASSERT(0,"Line 1 entry found in factoring savefile does not match exponent of run."); + itmp = fscanf(fp,"%u\n",&i ); if(itmp <= 0 || i != TF_PASSES ) ASSERT(0,"Line 2 entry found in factoring savefile does not match TF_PASSES value of build."); } if(!fq) { fprintf(stderr,"INFO: Unable to open factoring savefile %s for reading and/or %s.tmp for writing...quitting.\n",RESTARTFILE,RESTARTFILE); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } curr_line = 0; @@ -3970,14 +3970,14 @@ candidate factors that survive sieving. 
*/ itmp = fprintf(fq,"%s\n",pstring); if(itmp <= 0) { fprintf(stderr,"ERROR: unable to write Line %d (current exponent) to %s.\n", curr_line, TMPFILE); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* TF_PASSES: */ ++curr_line; itmp = fprintf(fq,"%u\n",TF_PASSES); if(itmp <= 0) { fprintf(stderr,"ERROR: unable to write Line %d (TF_PASSES of build) to %s!\n", curr_line, TMPFILE); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } // Now copy any remaining entries in existing file, modifying only the one corr. to the current pass, if it exists: @@ -3986,14 +3986,14 @@ candidate factors that survive sieving. */ if((char_addr = strstr(cstr,"Pass ")) != 0) { itmp = sscanf(char_addr,"%u",i); if(itmp <= 0) { - fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: unable to read [Pass *: k] entry: offending line = [%s]\n",cstr); ASSERT(0,"0"); } if(i == pass) { // Is the pass index the one we are updating? If yes, update the k-value - ASSERT(HERE, !found_pass, "Multiple current-pass entry found in savefile!"); + ASSERT(!found_pass, "Multiple current-pass entry found in savefile!"); found_pass = TRUE; // Calculate the current k-value k = (uint64)incr + (sweep+1)*(sieve_len<<6); - fprintf(fq,"Pass %u: %llu\n",pass,k); + fprintf(fq,"Pass %u: %" PRIu64 "\n",pass,k); } else // Otherwise just copy as-is fputs(cstr,fq); } else { // Just copy as-is @@ -4005,7 +4005,7 @@ candidate factors that survive sieving. */ fclose(fq); fq = 0x0; if(rename(TMPFILE,RESTARTFILE)) { sprintf(cstr,"ERROR: unable to rename %s file ==> %s.\n",TMPFILE,RESTARTFILE); - ASSERT(HERE,0,cstr); + ASSERT(0,cstr); } } // Successfully updated restart file. #endif /* #if !FAC_DEBUG */ @@ -4035,9 +4035,9 @@ candidate factors that survive sieving. */ #ifdef MULTITHREAD pthread_mutex_lock(&mutex_updatecount); - // printf("Thread %u locked mutex_updatecount ... 
Updating q-tried count: %llu + %llu = ",tid,*(targ->count),count); + // printf("Thread %u locked mutex_updatecount ... Updating q-tried count: %" PRIu64 " + %" PRIu64 " = ",tid,*(targ->count),count); *(targ->count) += count; - // printf("%llu ... Thread %u done.\n",*(targ->count),tid); + // printf("%" PRIu64 " ... Thread %u done.\n",*(targ->count),tid); pthread_mutex_unlock(&mutex_updatecount); return 0x0; #else @@ -4121,7 +4121,7 @@ uint32 CHECK_PKMOD60(uint64*p, uint32 lenP, uint64 k, uint32*incr) return i; } } - ASSERT(HERE, i == 16, "Expect precisely 16 valid k (mod 60) classes!"); + ASSERT(i == 16, "Expect precisely 16 valid k (mod 60) classes!"); //printf("\n"); return i; // Nonzero return value indicates success } @@ -4158,7 +4158,7 @@ uint32 CHECK_PKMOD4620(uint64*p, uint32 lenP, uint64 k, uint32*incr) } else { // Mersenne: For a valid p-mod, the only possible value of km are those for which k == +-1 (mod 8) [by quadratic residuacity] // and for which GCD(2*km*pm + 1, 2*4620) = 1, i.e. (2*km*pm + 1) is not divisible by 3,5,7 or 11. 
- // printf("CHECK_PKMOD4620: pm,km = %u,%u: q = %llu [mod 8 = %u]\n",pm,km,q,(uint32)q&7); + // printf("CHECK_PKMOD4620: pm,km = %u,%u: q = %" PRIu64 " [mod 8 = %u]\n",pm,km,q,(uint32)q&7); if(((q&7) == 1) || ((q&7) == 7)) { if((q%3 == 0) || (q%5 == 0) || (q%7 == 0) || (q%11 == 0)) return 0; @@ -4189,7 +4189,7 @@ uint32 CHECK_PKMOD4620(uint64*p, uint32 lenP, uint64 k, uint32*incr) return i; } } - ASSERT(HERE, i == 960, "Expect precisely 960 valid k (mod 4620) classes!"); + ASSERT(i == 960, "Expect precisely 960 valid k (mod 4620) classes!"); return i; // Nonzero return value indicates success } @@ -4209,7 +4209,7 @@ uint32 twop_mod_smallp(const int MODULUS_TYPE, const uint64*two_p, const uint32 r += (-((int32)r < 0)) & curr_p; r += r; if(r >= curr_p) { r -= curr_p; } - // ASSERT(HERE, r == mi64_div_y32(two_p, curr_p, 0x0, len2P), "Fast 2p (mod q) for MMp fails!"); + // ASSERT(r == mi64_div_y32(two_p, curr_p, 0x0, len2P), "Fast 2p (mod q) for MMp fails!"); } else { r = mi64_div_y32(two_p, curr_p, 0x0, len2P); } @@ -4238,7 +4238,7 @@ void get_startval( curr_p = p_last_small; for(m = nclear; m < nprime; m++) { - curr_p += (pdiff[m] << 1); ASSERT(HERE, pprimeF(curr_p,2), "Alleged curr_p is Composite!"); + curr_p += (pdiff[m] << 1); ASSERT(pprimeF(curr_p,2), "Alleged curr_p is Composite!"); uint32 twop_mod_currp = twop_mod_smallp(MODULUS_TYPE, two_p, findex, lenQ, curr_p); // This handles both the 1-word and multiword-exponent cases // Special-handling code for p == curr_p case - this is needed to prevent 0-input assertion in the modinv computation below. 
// Dec 2019: Replaced (p == curr_p) with if() clause which also catches curr_p-divides-exponent for composite exponents: @@ -4323,7 +4323,7 @@ void get_startval( if(interval_lo != 0) { /* bit_len is a uint32, so use i (also a 32-bit) in place of k (64-bit) here: */ i = ceil(1.0*bit_len/curr_p); - ASSERT(HERE, i*curr_p - bit_len == curr_p - (bit_len % curr_p), "i*curr_p - bit_len == curr_p - (bit_len % curr_p)"); + ASSERT(i*curr_p - bit_len == curr_p - (bit_len % curr_p), "i*curr_p - bit_len == curr_p - (bit_len % curr_p)"); /* Now calculate dstartval for the actual current-pass kmin value, according to the number of times we'd need to run through the sieve @@ -4337,10 +4337,10 @@ void get_startval( startval[m] = dstartval; #ifdef FAC_DEBUG - ASSERT(HERE, startval [m] < curr_p, "factor.c : startval [m] < curr_p"); + ASSERT(startval [m] < curr_p, "factor.c : startval [m] < curr_p"); #if DBG_SIEVE startval_incr[m] = i*curr_p - bit_len; - ASSERT(HERE, startval_incr[m] < curr_p, "factor.c : startval_incr[m] < curr_p"); + ASSERT(startval_incr[m] < curr_p, "factor.c : startval_incr[m] < curr_p"); #endif #endif } @@ -4366,7 +4366,7 @@ uint64 given_b_get_k(double bits, const uint64 two_p[], uint32 len) l = i-64; k = (uint64)(pow(2.0, bits-l)/(double)itmp64); // convert_uint64_base2_char(cbuf, itmp64); -// printf("2*p = %16llX has %u bits, lead64 = %s ==> k = %16llu.\n",itmp64,i,cbuf,k); +// printf("2*p = %16" PRIX64 " has %u bits, lead64 = %s ==> k = %16" PRIu64 ".\n",itmp64,i,cbuf,k); #endif return k; } @@ -4440,7 +4440,7 @@ uint64*kmin, uint64*know, uint64*kmax, uint32*passmin, uint32*passnow, uint32*pa char_addr++; tf_passes = convert_base10_char_uint64(char_addr); if(tf_passes != TF_PASSES) { - ++nerr; fprintf(stderr,"ERROR: Line %d of factoring restart file %s: TF_PASSES value [%llu] mismatches that of build [%u]!\n",curr_line,fname, tf_passes, (uint32)TF_PASSES); + ++nerr; fprintf(stderr,"ERROR: Line %d of factoring restart file %s: TF_PASSES value [%" PRIu64 "] 
mismatches that of build [%u]!\n",curr_line,fname, tf_passes, (uint32)TF_PASSES); } } @@ -4774,7 +4774,7 @@ int write_savefile(const char*fname, const char*pstring, uint32 passnow, uint64 } else if(passnow > passnow_file) { /* No-op */ } else { - ++nerr; fprintf(stderr,"ERROR: In factoring restart file %s: compared to previous checkpoint, passnow[%u] should be same as file[%u] and know[%llu] greater than file[%llu], or passnow should be greater!\n",fname,passnow,passnow_file,know,know_file); + ++nerr; fprintf(stderr,"ERROR: In factoring restart file %s: compared to previous checkpoint, passnow[%u] should be same as file[%u] and know[%" PRIu64 "] greater than file[%" PRIu64 "], or passnow should be greater!\n",fname,passnow,passnow_file,know,know_file); } /* Line 10: passmax: */ diff --git a/src/factor_test.h b/src/factor_test.h index f8d1fb94..9193f13a 100755 --- a/src/factor_test.h +++ b/src/factor_test.h @@ -128,7 +128,7 @@ int test_fac() /* TRYQ: */ #ifndef TRYQ /* This flag is required: */ - ASSERT(HERE, 0,"TRYQ not defined!"); + ASSERT(0,"TRYQ not defined!"); #else i = TRYQ; printf("TRYQ = %u\n", i); @@ -143,7 +143,7 @@ int test_fac() i = THREE_OP128; printf("THREE_OP128 = %u\n", i); /* iF NONZERO, Must = 1 : */ - ASSERT(HERE, (THREE_OP128 == 1),"THREE_OP128 Must = 0 or 1!"); + ASSERT((THREE_OP128 == 1),"THREE_OP128 Must = 0 or 1!"); /* Only relevant for TRYQ = 4 or 8: */ #if(TRYQ != 4 && TRYQ != 8) #error THREE_OP128 Only relevant for TRYQ = 4 or 8! 
@@ -161,7 +161,7 @@ int test_fac() /* NUM_SIEVING_PRIME: */ #ifndef NUM_SIEVING_PRIME /* This flag is required: */ - ASSERT(HERE, 0,"NUM_SIEVING_PRIME not defined!"); + ASSERT(0,"NUM_SIEVING_PRIME not defined!"); #else i = NUM_SIEVING_PRIME; printf("NUM_SIEVING_PRIME = %u\n", i); @@ -170,7 +170,7 @@ int test_fac() /* TF_CLASSES: */ #ifndef TF_CLASSES /* This flag is required: */ - ASSERT(HERE, 0,"TF_CLASSES not defined!"); + ASSERT(0,"TF_CLASSES not defined!"); #else i = TF_CLASSES; printf("TF_CLASSES = %u\n", i); @@ -253,7 +253,7 @@ int test_fac() #else i = USE_128x96; printf("USE_128x96 = %u\n", i); - ASSERT(HERE,i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n"); + ASSERT(i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n"); /* Only relevant for factoring up to 128 bits: */ #if(defined(P3WORD) || defined(P4WORD)) #warning USE_128x96 Only relevant for factoring up to 128 bits! @@ -270,7 +270,7 @@ int test_fac() #else i = USE_128x96; printf(" USE_128x96 = %u\n", i); - ASSERT(HERE,i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n"); + ASSERT(i <= 2,"Only USE_128x96 = 0-2 are recognized values!\n"); #endif #endif @@ -310,17 +310,17 @@ int test_fac() q = (uint64 *)calloc(l, sizeof(uint64)); q2 = (uint64 *)calloc(l, sizeof(uint64)); mi64_nega(q,q,l); - ASSERT(HERE, mi64_iszero(q,l), "mi64 -0 == 0 check fails!"); + ASSERT(mi64_iszero(q,l), "mi64 -0 == 0 check fails!"); q[0] = 1; mi64_nega(q,q,l); mi64_add_scalar(q,1,q,l); - ASSERT(HERE, mi64_iszero(q,l), "mi64 -1 + 1 == 0 check fails!"); + ASSERT(mi64_iszero(q,l), "mi64 -1 + 1 == 0 check fails!"); // Sep 2015 Bugfix: Hit case with len = 3 and these addends, which give a ripple carry into the top word: q[0] = 6216518070457578443ull; q2[0] = 12230226003251973173ull; q[1] = 16881888488052985758ull; q2[1] = 1564855585656565857ull; q[2] = 65307107850795ull; q2[2] = 2051081684ull; mi64_add(q,q2,q,3); - ASSERT(HERE, q[0] == 0ull && q[1] == 0ull && q[2] == 65309158932480ull, "Sep 2015 mi64_add bugfix test 
fails!"); + ASSERT(q[0] == 0ull && q[1] == 0ull && q[2] == 65309158932480ull, "Sep 2015 mi64_add bugfix test fails!"); /* Init the RNG: */ rng_isaac_init(TRUE); @@ -332,7 +332,7 @@ int test_fac() mi64_nega(q,q,l); mi64_negl(q2,q2,l); mi64_add_scalar(q2,1,q2,l); - ASSERT(HERE, mi64_cmp_eq(q,q2,l), "mi64 -q == ~q+1 check fails!"); + ASSERT(mi64_cmp_eq(q,q2,l), "mi64 -q == ~q+1 check fails!"); free((void*)q); free((void*)q2); q = q2 = 0x0; @@ -350,7 +350,7 @@ int test_fac() k = 7143819210136784550ull; p64 = 127; p192.d0 = 2294959606785646778ull; p192.d1 = 10167084567166165345ull; p192.d2 = 2688959234133783535ull; mi64_mul_vector_hi_qmmp((uint64*)&p192, p64, k, (uint64*)&q192, 192); - ASSERT(HERE, q192.d0 == 141525868296128525ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qmmp test fails!"); + ASSERT(q192.d0 == 141525868296128525ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qmmp test fails!"); /* 09/30/2015: Adapt above to test Fermat-factor analog of above, mi64_mul_vector_hi_qferm: Ex.: q = 2.k.2^128 + 1; k = 3571909605068392275, i.e. 
@@ -366,7 +366,7 @@ int test_fac() k = 3571909605068392275ull; p64 = 128; p192.d0 = 2294959606785646778ull; p192.d1 = 10167084567166165345ull; p192.d2 = 2688959234133783535ull; mi64_mul_vector_hi_qferm((uint64*)&p192, p64, k, (uint64*)&q192, 192); - ASSERT(HERE, q192.d0 == 2224217378008898426ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qferm test fails!"); + ASSERT(q192.d0 == 2224217378008898426ull && q192.d1 == 4269430960237156763ull && q192.d2 == 1041345754856384950ull, "mi64_mul_vector_hi_qferm test fails!"); // Apr 2015: mi64_div bug debug - 0-pad both inputs to yield a length-4 mi64 array: // Use 2^256 as a template for our 0-padding, but use 1 less leading 0 because convert_base10_char_mi64 @@ -374,14 +374,14 @@ int test_fac() // 2^256 = 115792089237316195423570985008687907853269984665640564039457584007913129639936: // Feb 2020: Chnages to length-setting logic in convert_base10_char_mi64 mean we must init i,j = 0 prior to calling that function: i = 0; p = convert_base10_char_mi64( "00000000000000000000000000000000000000364131549958466711308970009901738230041", &i); - ASSERT(HERE, mi64_getlen(p, i) == 3 && i == 4,"Bad p-length(s) in Apr2015 mi64_div test!"); + ASSERT(mi64_getlen(p, i) == 3 && i == 4,"Bad p-length(s) in Apr2015 mi64_div test!"); j = 0; q = convert_base10_char_mi64( "00000000000000000000000000000000000000000000000000000000019437941122649628431", &j); - ASSERT(HERE, mi64_getlen(q, j) == 2 && j == 4,"Bad q-length(s) in Apr2015 mi64_div test!"); + ASSERT(mi64_getlen(q, j) == 2 && j == 4,"Bad q-length(s) in Apr2015 mi64_div test!"); q2 = (uint64 *)calloc(4, sizeof(uint64)); // for quotient u64_arr = (uint64 *)calloc(4, sizeof(uint64)); // for remainder mi64_div(p,q,i,i,q2,u64_arr); - ASSERT(HERE, mi64_getlen( q2, i) == 2 && q2[1] == 1 && q2[0] == 286286737571717471ull, "bad quotient!"); - ASSERT(HERE, mi64_getlen(u64_arr, i) == 1 && u64_arr[0] == 618006351061617544ull, "bad remainder!"); + 
ASSERT(mi64_getlen( q2, i) == 2 && q2[1] == 1 && q2[0] == 286286737571717471ull, "bad quotient!"); + ASSERT(mi64_getlen(u64_arr, i) == 1 && u64_arr[0] == 618006351061617544ull, "bad remainder!"); fprintf(stderr,"Apr2015 mi64_div quicktest passes.\n"); free((void*)p); free((void*)q); free((void*)q2); free((void*)u64_arr); p = 0x0; q = 0x0; q2 = 0x0; u64_arr = 0x0; @@ -391,18 +391,18 @@ int test_fac() two_p = (uint64 *)calloc(i, sizeof(uint64)); mi64_add(p,p,two_p,i); j = 0; q = convert_base10_char_mi64("4969289881134175801642878989330437804491760137935869781219375395913301677808943323410612629818326630668131744420258226244511522022525093242408710254941677603671849301746980479735516135243111", &j); - ASSERT(HERE, i==j,"0"); + ASSERT(i==j,"0"); q2 = (uint64 *)calloc(i, sizeof(uint64)); u64_arr = (uint64 *)calloc(i, sizeof(uint64)); mi64_div(q,two_p,i,i,q2,u64_arr); - ASSERT(HERE, mi64_getlen(q2, i) == 1 , "k must be 64-bit!"); - ASSERT(HERE, q2[0] == 4677965, "k != expected value of 9355930!"); + ASSERT(mi64_getlen(q2, i) == 1 , "k must be 64-bit!"); + ASSERT(q2[0] == 4677965, "k != expected value of 9355930!"); if(!mi64_cmp_eq_scalar(u64_arr, 1ull, i)) { // Remainder = 1 fprintf(stderr,"ERROR : (p, q) = ( %s, %s ) : q mod (2p) = %s != 1!\n", &cbuf0[convert_mi64_base10_char(cbuf0, p, i, 0)], &cbuf1[convert_mi64_base10_char(cbuf1, q, i, 0)], &cbuf2[convert_mi64_base10_char(cbuf2, u64_arr, i, 0)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } else { fprintf(stderr,"mi64_div quicktest passes.\n"); } @@ -423,11 +423,11 @@ int test_fac() p[0] = 1; mi64_shl(p,p,j,i); // 2^n mi64_sub_scalar(p,1,p,i); // p = 2^n - 1; convert_mi64_base10_char(cbuf0, p, i, 0); - ASSERT(HERE, STREQ(cbuf0, "170141183460469231731687303715884105727"), "M127 string-conversion test failed!"); + ASSERT(STREQ(cbuf0, "170141183460469231731687303715884105727"), "M127 string-conversion test failed!"); mi64_set_eq (q, p, i); mi64_sub_scalar(q ,1ull,q ,i); // q = p-1 j = mi64_twopmodq(q, i, 0, p, i, 0x0); - 
ASSERT(HERE, j == 1, "M127 base-2 PRP test failed!"); + ASSERT(j == 1, "M127 base-2 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-2 PRP test of M127 passed: Time =%s\n",get_time_str(tdiff)); @@ -445,13 +445,13 @@ int test_fac() q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even if(mi64_twopmodq(p, lenP, 56474845800ull, q, lenQ, q2) != 1) { printf("ERROR: res = %s != 1\n", &cbuf[convert_mi64_base10_char(cbuf0, q2, lenQ, 0)]); - ASSERT(HERE, 0, "MM31 known-factor (k = 56474845800) test failed!"); + ASSERT(0, "MM31 known-factor (k = 56474845800) test failed!"); } q[1] = mi64_mul_scalar( p, 2*41448832329225ull, q, lenP); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even if(mi64_twopmodq(p, lenP, 41448832329225ull, q, lenQ, q2) != 1) { printf("ERROR: res = %s != 1\n", &cbuf[convert_mi64_base10_char(cbuf0, q2, lenQ, 0)]); - ASSERT(HERE, 0, "MM31 known-factor (k = 41448832329225) test failed!"); + ASSERT(0, "MM31 known-factor (k = 41448832329225) test failed!"); } free((void*)p); free((void*)q); free((void*)q2); p = q = q2 = 0x0; @@ -463,18 +463,18 @@ int test_fac() p[0] = 1; mi64_shl(p,p,j,i); // 2^n mi64_sub_scalar(p,1,p,i); // p = 2^n - 1; convert_mi64_base10_char(cbuf0, p, i, 0); - ASSERT(HERE, STREQ(cbuf0, "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127"), "M607 string-conversion test failed!"); + ASSERT(STREQ(cbuf0, "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127"), "M607 string-conversion test failed!"); mi64_set_eq (q, p, i); mi64_sub_scalar(q ,1ull,q ,i); // q = p-1 clock1 = clock(); j = mi64_twopmodq(q, i, 0, p, i, 0x0); - ASSERT(HERE, j == 1, "M607 base-2 PRP test failed!"); + ASSERT(j == 1, 
"M607 base-2 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-2 PRP test of M607 passed: Time =%s\n",get_time_str(tdiff)); // Try the general-base PRP routine on the same number: clock1 = clock(); j = mi64_pprimeF(p, 3, i); - ASSERT(HERE, j == 1, "M607 base-3 PRP test failed!"); + ASSERT(j == 1, "M607 base-3 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-3 PRP test of M607 passed: Time =%s\n",get_time_str(tdiff)); free((void*)p); free((void*)q); p = q = 0x0; @@ -490,13 +490,13 @@ int test_fac() mi64_sub_scalar(q ,1ull,q ,i); // q = p-1 clock1 = clock(); j = mi64_twopmodq(q, i, 0, p, i, 0x0); - ASSERT(HERE, j == 1, "M4423 base-2 PRP test failed!"); + ASSERT(j == 1, "M4423 base-2 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-2 PRP test of M4423 passed: Time =%s\n",get_time_str(tdiff)); // Try the general-base PRP routine on the same number: clock1 = clock(); j = mi64_pprimeF(p, 3, i); - ASSERT(HERE, j == 1, "M4423 base-3 PRP test failed!"); + ASSERT(j == 1, "M4423 base-3 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-3 PRP test of M4423 passed: Time =%s\n",get_time_str(tdiff)); free((void*)p); free((void*)q); p = q = 0x0; @@ -509,18 +509,18 @@ int test_fac() q = (uint64 *)calloc(i, sizeof(uint64)); p[0] = 1; mi64_shl(p,p,j,i); // 2^n mi64_sub_scalar(p,1,p,i); // p = 2^n - 1; next we p /= 458072843161 : -ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/458072843161 divisibility test fails!"); +ASSERT(0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/458072843161 divisibility test fails!"); mi64_set_eq (q, p, i); mi64_sub_scalar(q ,1ull,q ,i); // q = p-1 clock1 = clock(); j = mi64_twopmodq(q, i, 0, p, i, 0x0); - ASSERT(HERE, j == 1, "M7331 cofactor base-2 PRP test failed!"); + ASSERT(j == 1, "M7331 cofactor 
base-2 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-2 PRP test of M7331 cofactor passed: Time =%s\n",get_time_str(tdiff)); // Try the general-base PRP routine on the same number: clock1 = clock(); j = mi64_pprimeF(p, 3, i); - ASSERT(HERE, j == 1, "M7331 cofactor base-3 PRP test failed!"); + ASSERT(j == 1, "M7331 cofactor base-3 PRP test failed!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-3 PRP test of M7331 cofactor passed: Time =%s\n",get_time_str(tdiff)); free((void*)p); free((void*)q); p = q = 0x0; @@ -538,7 +538,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 mi64_sub_scalar(q ,1ull,q ,i); // q = p-1 clock1 = clock(); j = mi64_twopmodq(q, i, 0, p, i, 0x0); - ASSERT(HERE, j == 1, "M11213 base-2 PRP test failed!"); + ASSERT(j == 1, "M11213 base-2 PRP test failed!"); free((void*)p); free((void*)q); p = q = 0x0; clock2 = clock(); tdiff = (double)(clock2 - clock1); clock1 = clock2; printf ("Base-2 PRP test of M11213 passed: Time =%s\n",get_time_str(tdiff)); @@ -557,12 +557,12 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 q[0] = 1; mi64_shl(q,q,j,i); // 2^607 mi64_sub_scalar(q,1,q,i); // p = 2^607 - 1; // Mul by any scalar < 2^33 should have no carry out of the 10th 64-bit word - ASSERT(HERE, 0 == mi64_mul_scalar(q,2*28115877,q,i), "2.k.M607 (k = 28115877) illegal carryout on scalar-mul!"); + ASSERT(0 == mi64_mul_scalar(q,2*28115877,q,i), "2.k.M607 (k = 28115877) illegal carryout on scalar-mul!"); mi64_set_eq (q2, q, i); // q2 = q-1 mi64_add_scalar(q ,1ull,q ,i); // q = 2.k.p + 1 convert_mi64_base10_char(cbuf0, q, i, 0); - ASSERT(HERE, STREQ(cbuf0, "29866820952126214568806646392159603944715357116119498255498035716027095678819717544056871993402815945328710228895559628455719074056369970920495232704087963394016941839123205985860254232344759"), "q = 2.k.M607+1 (k = 28115877) string-conversion 
test failed!"); - ASSERT(HERE, mi64_twopmodq(q2, i, 0, q, i, 0x0) == 1, "q = 2.k.M607+1 (k = 28115877) base-2 PRP test failed!"); + ASSERT(STREQ(cbuf0, "29866820952126214568806646392159603944715357116119498255498035716027095678819717544056871993402815945328710228895559628455719074056369970920495232704087963394016941839123205985860254232344759"), "q = 2.k.M607+1 (k = 28115877) string-conversion test failed!"); + ASSERT(mi64_twopmodq(q2, i, 0, q, i, 0x0) == 1, "q = 2.k.M607+1 (k = 28115877) base-2 PRP test failed!"); free((void*)q); free((void*)q2); q = q2 = 0x0; @@ -571,11 +571,11 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 */ // 2nd multiplicand is just leading digits of Pi, sans decimal point: j = 0; q2 = convert_base10_char_mi64("3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117450284102701938521105559644622948954930381964428810975665933446128475648233786783165271201909145648566923460348610454326648213393607260249141273724587006606315588174881520920962829254091715364367892590360011330530548820466521", &j); - ASSERT(HERE, j == 20, "vector lengths should be 20!"); + ASSERT(j == 20, "vector lengths should be 20!"); q = (uint64 *)calloc(j, sizeof(uint64)); // output array mi64_mul_vector_hi_qmmp(q2, 1231, 60773088284ull, q, (j<<6)); // q = 2.k.M(p) + 1 with k = 60773088284 convert_mi64_base10_char(cbuf0, q, j, 0); - ASSERT(HERE, STREQ(cbuf0, "678299328487875406787553667584424766193319571425229812042632483796223090743976740829512533956144441574815272835626612961160454952708658437402700559999225654073147100413573556498251710301510338504761109128343850675314104893353603303495634850631971760134667616782442458276408663375682004856646999060481786800862572039635523841600325205075025327991817191734342347965082117753555537"), "mi64_mul_vector_hi_qmmp test failed!"); + ASSERT(STREQ(cbuf0, 
"678299328487875406787553667584424766193319571425229812042632483796223090743976740829512533956144441574815272835626612961160454952708658437402700559999225654073147100413573556498251710301510338504761109128343850675314104893353603303495634850631971760134667616782442458276408663375682004856646999060481786800862572039635523841600325205075025327991817191734342347965082117753555537"), "mi64_mul_vector_hi_qmmp test failed!"); free((void*)q); free((void*)q2); q = q2 = 0x0; @@ -583,14 +583,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 q192.d2=506560280167ull; q192.d1=18446744073709551615ull; q192.d0=18446743060588991281ull; p192 = q192; p192.d0 -= 1; x192 = twopmodq192(p192,q192); - ASSERT(HERE, CMPEQ192(x192, ONE192),"Bad twopmodq192 output"); + ASSERT(CMPEQ192(x192, ONE192),"Bad twopmodq192 output"); #if 0 /* 12/23/2008: Use this to help debug the mi64 powering routine: */ j = mi64_twopmodq(&p192.d0, 3, 0, &q192.d0, 3, 0x0); if(j != 1) { printf("12/23/2008 mi64_twopmodq Test failed!\n"); - // ASSERT(HERE, j == 1, "mi64_twopmodq != 1"); + // ASSERT(j == 1, "mi64_twopmodq != 1"); // exit(0); } #endif @@ -615,8 +615,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 MULH192_TRUNC(p192,q192,0ull,y192); // Expected value of 64-bit carry layer at top of low-half product = 0 /* Reference value to compare to: */ q192.d2= 11ull; q192.d1= 320947345442520101ull; q192.d0= 2846153632803221902ull; - ASSERT(HERE, CMPEQ192(x192, q192),"MULH192 fails!"); - ASSERT(HERE, CMPEQ192(y192, q192),"MULH192_TRUNC fails!"); + ASSERT(CMPEQ192(x192, q192),"MULH192 fails!"); + ASSERT(CMPEQ192(y192, q192),"MULH192_TRUNC fails!"); /* Count the # of test q's of the various sizes: */ for(ntest63 = 0; fac63 [ntest63 ].p != 0; ++ntest63 ){} @@ -641,23 +641,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 // In the Fermat cae we let 2^n play the role of the Mersenne exponent p and generalize 
from there. p64 = 1ull << ffac64[i].p; k = ffac64[i].q << 1; // Factors of Fn have form q = k.2^(n+2) + 1; n stored in .p, k in .q q64 = 2*k*p64 + 1; // p64 now stores 2^n - ASSERT(HERE, q64%(p64<<2)==1, "test_fac : q64 % 2^(n+2) != 1 !"); + ASSERT(q64%(p64<<2)==1, "test_fac : q64 % 2^(n+2) != 1 !"); pm60 = p64%60; km60 = k %60; if(!CHECK_PKMOD60(&p64,1, km60, 0x0)) { - fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60); + ASSERT(0,"0"); } pm60 = p64%4620; km60 = k %4620; if(!CHECK_PKMOD4620(&p64,1, km60, 0x0)) { - fprintf(stderr,"Illegal (p,k) mod 4620 pair: p,p mod 4620, k,k mod 4620 = %llu %4u %llu %4u\n",p64,pm60,k,km60); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"Illegal (p,k) mod 4620 pair: p,p mod 4620, k,k mod 4620 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60); + ASSERT(0,"0"); } res64 = twopmodq64(p64, q64); if(res64 != q64-1ull) { // Nov 2021: fiddled twopmodq64() to return true-mod - fprintf(stderr,"ERROR: twopmodq64(F%u, k = %llu) returns non-unity result %u\n",(uint32)ffac64[i].p,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq64(F%u, k = %" PRIu64 ") returns non-unity result %u\n",(uint32)ffac64[i].p,k, (uint32)res64); + ASSERT(0,"0"); } } @@ -678,7 +678,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 fprintf(stderr,"ERROR: twopmodq128(F%u, %s ) returns non-unity result %s\n",(uint32)p64, &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint128_base10_char(cbuf2, res128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -699,7 +699,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 fprintf(stderr,"ERROR: twopmodq192(F%u, %s ) returns non-unity result %s\n",(uint32)p64, &cbuf1[convert_uint192_base10_char(cbuf1, q192)], 
&cbuf2[convert_uint192_base10_char(cbuf2, res192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -720,7 +720,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 fprintf(stderr,"ERROR: twopmodq256(F%u, %s ) returns non-unity result %s\n",(uint32)p64, &cbuf1[convert_uint256_base10_char(cbuf1, q256)], &cbuf2[convert_uint256_base10_char(cbuf2, res256)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -736,22 +736,22 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 j = ffacBig[i].p; // Fermat index n if(j > 1000) break; // Tune this as desired to skip larger time-consuming cases l = (uint32)ffacBig[i].d1; // Power of 2 appearing in factor q = k*2^l + 1 - ASSERT(HERE, l >= (j+2), "Power of 2 appearing in factor of Fn must be >= [n+2]!"); + ASSERT(l >= (j+2), "Power of 2 appearing in factor of Fn must be >= [n+2]!"); k = ffacBig[i].d0; // Factor k; must be odd in this schema - ASSERT(HERE, 1ull == (k & 1ull), "k must be odd!"); + ASSERT(1ull == (k & 1ull), "k must be odd!"); lenP = (j+63)>>6; // Assume Fermat index increases as we traverse ffacBig array, thus this overwrites previous p[0] = 1ull; p[lenP] = mi64_shl(p,p,j,lenP); lenP += (p[lenP] != 0ull); // case's p = (1 << j) array elements. 
lenQ = (l+63)>>6; q[0] = k; q[lenQ] = mi64_shl(q,q,l,lenQ); lenQ += (q[lenQ] != 0ull); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even - //printf("Testing F%u, q = %llu * 2^%u + 1, lenQ = %u...\n",j,k,l,lenQ); + //printf("Testing F%u, q = %" PRIu64 " * 2^%u + 1, lenQ = %u...\n",j,k,l,lenQ); uint32 res1 = mi64_twopmodq(p, lenP, k << (l-j-1), q, lenQ, q2); // Fiddle k to put q in Mersenne-like form = 2.k'.2^j + 1 // res1 = mi64_twopmodq_qferm(j, k << (l-j), q2); if(res1 != 1) { - fprintf(stderr,"ERROR: mi64_twopmodq(F%u, q = %llu * 2^%u + 1 = %s) returns non-unity result %s\n",j,k,l, + fprintf(stderr,"ERROR: mi64_twopmodq(F%u, q = %" PRIu64 " * 2^%u + 1 = %s) returns non-unity result %s\n",j,k,l, &cbuf1[convert_mi64_base10_char(cbuf1, q, lenQ, 0)], &cbuf2[convert_mi64_base10_char(cbuf2,q2, lenQ, 0)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -763,8 +763,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 { p64 = fac63[i].p; q64 = fac63[i].q; /* Make sure the MSB = 0: */ - ASSERT(HERE, ( int64)p64 > 0, "test_fac : ( int64)p64 > 0"); - ASSERT(HERE, q64%(2*p64) ==1, "test_fac : q64%(2*p64) ==1"); + ASSERT(( int64)p64 > 0, "test_fac : ( int64)p64 > 0"); + ASSERT(q64%(2*p64) ==1, "test_fac : q64%(2*p64) ==1"); k = (q64-1)/(2*p64); for(j = 0; j < 64; j++) { karr[j] = k; } pm60 = p64%60; km60 = k %60; @@ -776,19 +776,19 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 compositeness test as an exponent filter: */ if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0)) { - fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60); + ASSERT(0,"0"); } if((res64 = twopmodq63(p64, q64)) != 1ull) { - fprintf(stderr,"ERROR: twopmodq63(%llu, k = %llu) returns non-unity result 
%u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq63(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } if((res64 = twopmodq64(p64, q64)) != 1ull) { - fprintf(stderr,"ERROR: twopmodq64(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq64(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FLOAT @@ -796,16 +796,16 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq78_3WORD_DOUBLE(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } /* this is currently sse2/msvc only : p192.d0 = p64; p192.d1 = p192.d2 = 0; x256 = twopmodq200_8WORD_DOUBLE((uint64*)&p192, k); res64 = !x256.d3 && (uint64)CMPEQ192(x256, ONE192); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq200_8WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq200_8WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } */ #endif @@ -813,15 +813,15 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res96 = twopmodq96(p64, k); if(!CMPEQ96(ONE96,res96)) { - fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k, + fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k, &cbuf2[convert_uint96_base10_char(cbuf2, res96)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128_96(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: 
twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); + fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); } #ifdef USE_FMADD @@ -829,8 +829,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif @@ -840,8 +840,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq78_3WORD_DOUBLE_q2(p64,k,k, 0,0); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #ifdef USE_FMADD @@ -849,23 +849,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE_q2(p64,k,k); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2( %llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2( %" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #elif(TRYQ == 4) res64 = twopmodq63_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq63_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq63_q4( %" PRIu64 ", k = %" PRIu64 " x 
4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FLOAT res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #ifdef USE_FMADD @@ -873,57 +873,57 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = twopmodq96_q4(p64,k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq96_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq128_96_q4( %llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q4( %" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #elif(TRYQ == 8) res64 = twopmodq63_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq63_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + 
fprintf(stderr,"ERROR: twopmodq63_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #if defined(USE_FLOAT) && defined(USE_SSE2) && (OS_BITS == 64) res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq96_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq128_96_q8( %llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q8( %" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #elif(TRYQ == 16) #if defined(USE_FLOAT) && defined(USE_AVX)&& defined(COMPILER_TYPE_GCC) && (OS_BITS == 64) res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0); if(res64 != 0xffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #else #error (TRYQ == 16) only supported for 64-bit/P1WORD/GCC/AVX builds! 
@@ -931,14 +931,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 #elif(TRYQ >= 32) res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0); if(res64 != 0xffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } // #elif(TRYQ == 64) res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0); if(res64 != 0xffffffffffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif } @@ -951,7 +951,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 { p64 = fac64[i].p; q64 = fac64[i].q; - ASSERT(HERE, q64%(2*p64)==1, "test_fac : q64%(2*p64)==1"); + ASSERT(q64%(2*p64)==1, "test_fac : q64%(2*p64)==1"); k = (q64-1)/(2*p64); for(j = 0; j < 64; j++) { karr[j] = k; } pm60 = p64%60; @@ -964,20 +964,20 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 compositeness test as an exponent filter: */ if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0)) { - fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60); + ASSERT(0,"0"); } if(q64%(2*p64) != 1) { - fprintf(stderr,"ERROR : (p, q) = ( %llu, %llu ) : q mod (2p) = %llu != 1!\n",p64,q64, q64%(2*p64)); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR : (p, q) = ( %" PRIu64 
", %" PRIu64 " ) : q mod (2p) = %" PRIu64 " != 1!\n",p64,q64, q64%(2*p64)); + ASSERT(0,"0"); } if((res64 = twopmodq64(p64, q64)) != 1ull) { - fprintf(stderr,"ERROR: twopmodq64(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq64(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FLOAT @@ -985,24 +985,24 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq78_3WORD_DOUBLE(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res96 = twopmodq96(p64, k); if(!CMPEQ96(ONE96,res96)) { - fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k, + fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k, &cbuf2[convert_uint96_base10_char(cbuf2, res96)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128_96(p64,k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FMADD @@ -1010,8 +1010,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE(p64,k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, 
(uint32)res64); + ASSERT(0,"0"); } #endif @@ -1022,8 +1022,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq78_3WORD_DOUBLE_q2(p64, k,k, 0,0); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2( %" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #ifdef USE_FMADD @@ -1031,23 +1031,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE_q2(p64, k,k); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #elif(TRYQ == 4) res64 = twopmodq64_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq64_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq64_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FLOAT res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #ifdef USE_FMADD @@ -1055,56 +1055,56 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = 
twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = twopmodq96_q4(p64,k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq128_96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #elif(TRYQ == 8) res64 = twopmodq64_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq64_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq64_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #if defined(USE_FLOAT) && defined(USE_SSE2) && (OS_BITS == 64) res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%llu, k = %llu x 4 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = 
twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq128_96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #elif(TRYQ == 16) #if defined(USE_FLOAT) && defined(USE_AVX)&& defined(COMPILER_TYPE_GCC) && (OS_BITS == 64) res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0); if(res64 != 0xffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #else #error (TRYQ == 16) only supported for 64-bit/P1WORD/GCC/AVX builds! 
@@ -1112,14 +1112,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 #elif(TRYQ >= 32) res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0); if(res64 != 0xffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } // #elif(TRYQ == 64) res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0); if(res64 != 0xffffffffffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif } @@ -1134,7 +1134,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 q128.d1 = (uint64)1; q128.d0 = q64; /* Modify this so it'll work with 65-bit q's: */ - ASSERT(HERE, ((q64-1)/2 + 0x8000000000000000ull)%p64==0, "test_fac : ((q64-1)/2 + 0x8000000000000000ull)%p64==0"); + ASSERT(((q64-1)/2 + 0x8000000000000000ull)%p64==0, "test_fac : ((q64-1)/2 + 0x8000000000000000ull)%p64==0"); k = ((q64-1)/2 + 0x8000000000000000ull)/p64; for(j = 0; j < 64; j++) { karr[j] = k; } pm60 = p64%60; @@ -1144,37 +1144,37 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 compositeness test as an exponent filter: */ if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0)) { - fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %llu %4u\n",p64,pm60,k,km60); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %" PRIu64 " %4u\n",p64,pm60,k,km60); + ASSERT(0,"0"); } 
if((res64 = twopmodq65(p64,k)) != 1) { - fprintf(stderr,"ERROR: twopmodq65(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq65(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FLOAT res64 = twopmodq78_3WORD_DOUBLE(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res96 = twopmodq96(p64, k); if(!CMPEQ96(ONE96,res96)) { - fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k, + fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k, &cbuf2[convert_uint96_base10_char(cbuf2, res96)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128_96(p64,k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FMADD @@ -1182,8 +1182,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif @@ -1193,8 +1193,8 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq78_3WORD_DOUBLE_q2(p64, k,k, 0,0); if(res64 != 
3) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #ifdef USE_FMADD @@ -1202,23 +1202,23 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE_q2(p64, k,k); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #elif(TRYQ == 4) res64 = twopmodq65_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq65_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq65_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #ifdef USE_FLOAT res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif #ifdef USE_FMADD @@ -1226,56 +1226,56 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + 
fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = twopmodq96_q4(p64,k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq128_96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #elif(TRYQ == 8) res64 = twopmodq65_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq65_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq65_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #if defined(USE_FLOAT) && defined(USE_SSE2) && (OS_BITS == 64) res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%llu, k = %llu x 4 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q8(%" 
PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq128_96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #elif(TRYQ == 16) #if defined(USE_FLOAT) && defined(USE_AVX)&& defined(COMPILER_TYPE_GCC) && (OS_BITS == 64) res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0); if(res64 != 0xffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #else #error (TRYQ == 16) only supported for 64-bit/P1WORD/GCC/AVX builds! 
@@ -1283,14 +1283,14 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 #elif(TRYQ >= 32) res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0); if(res64 != 0xffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } // #elif(TRYQ == 64) res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0); if(res64 != 0xffffffffffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif } @@ -1310,7 +1310,7 @@ ASSERT(HERE, 0 == mi64_div_by_scalar64(p, 458072843161ull, i, p), "M7331/4580728 &cbuf0[convert_uint64_base10_char (cbuf0, p64)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint64_base10_char (cbuf2, res64)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } q128.d0 += 1ull; @@ -1348,16 +1348,16 @@ if((q128.d1 >> 14) == 0) { dbl = (double)q96.d0 + (double)q96.d1*TWO64FLOAT; rnd = log(dbl)/log(2.0); if(rnd > 77) - printf("p = %10llu, p,k (mod 60) = %2u, %2u, lg(q) = %10.5f\n",p64,pm60,km60,rnd); + printf("p = %10" PRIu64 ", p,k (mod 60) = %2u, %2u, lg(q) = %10.5f\n",p64,pm60,km60,rnd); } */ /* This property only applies for prime exponents, so use a quick base-2 Fermat compositeness test as an exponent filter: */ if(twopmodq64(p64-1, p64) == 1ull && !CHECK_PKMOD60(&p64,1, km60, 0x0)) { - fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %llu %4u %s %4u\n",p64,pm60, + fprintf(stderr,"Illegal (p,k) mod 60 pair: p,p mod 60, k,k mod 60 = %" PRIu64 " %4u %s 
%4u\n",p64,pm60, &cbuf1[convert_uint128_base10_char(cbuf1, x128)],km60); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* Here use full 96-bit q in both floating and 96-bit modmul, so compute for both: */ @@ -1369,17 +1369,17 @@ if((q128.d1 >> 14) == 0) { if((q96.d1 >> 14) == 0) { /* Integer-truncation-on-store should obviate the need to subtract 1 from q, and (double)q is only accurate to 53 bits to begin with): */ - ASSERT(HERE, x128.d1 == 0, "High half of exactly-computed k nonzero!"); + ASSERT(x128.d1 == 0, "High half of exactly-computed k nonzero!"); dbl = (double)q96.d0 + (double)q96.d1*TWO64FLOAT; dbl /= (2.0*p64); rnd = DNINT(dbl); k = (uint64)rnd; - ASSERT(HERE, x128.d0 == k, "Approx and exactly-computed k differ!"); + ASSERT(x128.d0 == k, "Approx and exactly-computed k differ!"); res64 = twopmodq78_3WORD_DOUBLE(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } #endif @@ -1387,16 +1387,16 @@ if((q128.d1 >> 14) == 0) { res96 = twopmodq96(p64, k); if(!CMPEQ96(ONE96,res96)) { - fprintf(stderr,"ERROR: twopmodq96(%llu, k = %llu) returns non-unity result %s\n",p64,k, + fprintf(stderr,"ERROR: twopmodq96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %s\n",p64,k, &cbuf2[convert_uint96_base10_char(cbuf2, res96)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128_96(p64, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq128_96(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } p128.d0 = p64; p128.d1 = 0; @@ -1407,15 +1407,15 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, 
p64)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint128_base10_char(cbuf2, res128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } if(x128.d1 == 0) { res64 = twopmodq128x2((uint64 *)&p128, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq128x2(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128x2(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } @@ -1424,8 +1424,8 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq100_2WORD_DOUBLE(p64, q128); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif @@ -1437,8 +1437,8 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq78_3WORD_DOUBLE_q2(p64, k,k, 0,0); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } #endif @@ -1447,8 +1447,8 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq100_2WORD_DOUBLE_q2(p64, k,k); if(res64 != 3) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%llu, k = %llu x 2 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q2(%" PRIu64 ", k = %" PRIu64 " x 2 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif @@ -1462,8 +1462,8 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq78_3WORD_DOUBLE_q4(p64, k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%llu, 
k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } #endif @@ -1472,21 +1472,21 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq100_2WORD_DOUBLE_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq100_2WORD_DOUBLE_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #endif res64 = twopmodq96_q4(p64,k,k,k,k, 0,0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q4(p64,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq128_96_q4(%llu, k = %llu x 4 ) failed to find factor, res = 0x%1X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q4(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#1X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } // k must be 64-bit @@ -1500,30 +1500,30 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq78_3WORD_DOUBLE_q8(p64, karr, 0,0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%llu, k = %llu x 4 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q8(%" PRIu64 ", k = %" PRIu64 " x 4 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } #endif res64 = twopmodq96_q8(p64,k,k,k,k,k,k,k,k, 0,0); if(res64 != 255) { - 
fprintf(stderr,"ERROR: twopmodq96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_96_q8(p64,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq128_96_q8(%llu, k = %llu x 8 ) failed to find factor, res = 0x%2X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128_96_q8(%" PRIu64 ", k = %" PRIu64 " x 8 ) failed to find factor, res = %#2X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } res64 = twopmodq128_q8((uint64 *)&p128,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint128_base10_char(cbuf0,p128)], &cbuf1[convert_uint128_base10_char(cbuf1,q128)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } // k must be 64-bit @@ -1536,8 +1536,8 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq78_3WORD_DOUBLE_q16(p64 ,karr, 0,0); if(res64 != 0xffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %llu, k = %llu x 16) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q16( %" PRIu64 ", k = %" PRIu64 " x 16) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } #else @@ -1549,8 +1549,8 @@ if((q128.d1 >> 14) == 0) { if((q96.d1 >> 14) == 0) { res64 = twopmodq78_3WORD_DOUBLE_q32(p64 ,karr, 0,0); if(res64 != 0xffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %llu, k = %llu x 32) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q32( %" PRIu64 ", k = %" PRIu64 " x 32) failed to find factor, 
res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } } // k must be 52-bit or less @@ -1559,8 +1559,8 @@ if((q128.d1 >> 14) == 0) { if((q96.d1 >> 14) == 0) { res64 = twopmodq78_3WORD_DOUBLE_q64(p64 ,karr, 0,0); if(res64 != 0xffffffffffffffff) { - fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %llu, k = %llu x 64) failed to find factor, res = 0x%4X.\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq78_3WORD_DOUBLE_q64( %" PRIu64 ", k = %" PRIu64 " x 64) failed to find factor, res = %#4X.\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } } } // k must be 52-bit or less @@ -1586,7 +1586,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, p64)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint64_base10_char (cbuf2, res64)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } q128.d0 += 1ull; @@ -1600,7 +1600,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint128_base10_char(cbuf0, p128)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint128_base10_char(cbuf2, x128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* To find the quotient k = (q-1)/(2*p), which may be > 64 bits, use mod-inverse with base 2^128 arithmetic. 
@@ -1635,7 +1635,7 @@ if((q128.d1 >> 14) == 0) { fprintf(stderr,"ERROR: Illegal (p,k) mod 60 pair: p, p mod 60, q128, k mod 60 = %s %4u %s %4u\n", &cbuf0[convert_uint64_base10_char (cbuf0, p64)], pm60, &cbuf1[convert_uint128_base10_char(cbuf1, q128)], km60); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res128 = twopmodq128(p128, q128); @@ -1645,7 +1645,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, p64)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint128_base10_char(cbuf2, res128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); #if 0 /* 10^31 in binary form - need this to reconstruct large factors that were truncated at 30 digits in the PrimeNet report printout: */ @@ -1773,27 +1773,27 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq128x2((uint64 *)&p128, k); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq128x2(%llu, k = %llu) returns non-unity result %u\n",p64,k, (uint32)res64); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"ERROR: twopmodq128x2(%" PRIu64 ", k = %" PRIu64 ") returns non-unity result %u\n",p64,k, (uint32)res64); + ASSERT(0,"0"); } #if(TRYQ == 4) res64 = twopmodq128_q4((uint64 *)&p128,k,k,k,k); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq128_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq128_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint128_base10_char(cbuf0,p128)], &cbuf1[convert_uint128_base10_char(cbuf1,q128)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #elif(TRYQ == 8) res64 = twopmodq128_q8((uint64 *)&p128,k,k,k,k,k,k,k,k); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n", + fprintf(stderr,"ERROR: twopmodq128_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n", &cbuf0[convert_uint128_base10_char(cbuf0,p128)], &cbuf1[convert_uint128_base10_char(cbuf1,q128)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #endif } // k must be 64-bit @@ 
-1840,7 +1840,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)], &cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)], &cbuf2[convert_uint128_base10_char(cbuf2, q128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128x2B((uint64*)&p128, q128); @@ -1850,7 +1850,7 @@ if((q128.d1 >> 14) == 0) { i,i2, fac63[i].p, fac64[i2].p, &cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)], &cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } } @@ -1893,7 +1893,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, fac64[i].q)], &cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)], &cbuf2[convert_uint128_base10_char(cbuf2, q128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128x2B((uint64*)&p128, q128); @@ -1903,7 +1903,7 @@ if((q128.d1 >> 14) == 0) { i,i2, fac64[i].p, fac64[i2].p, &cbuf0[convert_uint64_base10_char (cbuf0, fac64[i].q)], &cbuf1[convert_uint64_base10_char (cbuf1, fac64[i2].q)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } } @@ -1943,7 +1943,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)], &cbuf1[convert_uint128_base10_char(cbuf1, x128)]); fprintf(stderr," q128.d1 += fac63[i].q overflows!\n"); - ASSERT(HERE, q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q"); /* Make sure sum didn't overflow */ + ASSERT(q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q"); /* Make sure sum didn't overflow */ } /* Skip the q%(2*p) == 1 and (p%60,q%60) checks, as they don't apply @@ -1959,7 +1959,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)], &cbuf1[convert_uint128_base10_char(cbuf1, x128)], &cbuf2[convert_uint128_base10_char(cbuf2, res128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res64 = twopmodq128x2B((uint64*)&p128, q128); @@ -1969,7 +1969,7 @@ if((q128.d1 >> 14) == 0) { i,i2, fac63[i].p, fac65[i2].p, &cbuf0[convert_uint64_base10_char (cbuf0, 
fac63[i].q)], &cbuf1[convert_uint128_base10_char(cbuf1, x128)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } } @@ -1995,7 +1995,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint128_base10_char(cbuf0, p128)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint128_base10_char(cbuf2, x128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res128 = twopmodq128(p128, q128); @@ -2005,7 +2005,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint128_base10_char(cbuf0, p128)], &cbuf1[convert_uint128_base10_char(cbuf1, q128)], &cbuf2[convert_uint128_base10_char(cbuf2, res128)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -2031,18 +2031,18 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], &cbuf2[convert_uint192_base10_char(cbuf2, x192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } // Now compute k = (q-1)/2p, while verifying that q%2p = 1: mi64_div((uint64*)&q192, (uint64*)&two_p192, 3,3, (uint64*)&x192, (uint64*)&res192); // x192 contains k - ASSERT(HERE, x192.d2 == 0 && x192.d1 == 0,"k > 2^64!"); + ASSERT(x192.d2 == 0 && x192.d1 == 0,"k > 2^64!"); if(!CMPEQ192(res192, ONE192)) { fprintf(stderr,"ERROR: twopmodq192( %s, %s ) returns non-unity result!\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */ @@ -2051,38 +2051,38 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq160_q4(p192,q192,q192,q192,q192); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } */ res64 = 
twopmodq192_q4((uint64*)&p192,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); // *** disable this to allow fast-UMULH192 timing-testing *** + ASSERT(0,"0"); // *** disable this to allow fast-UMULH192 timing-testing *** } #elif(TRYQ == 8) /* res64 = twopmodq160_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n", + fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } */ res64 = twopmodq192_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n", + fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #endif } @@ -2106,7 +2106,7 @@ if((q128.d1 >> 14) == 0) { fprintf(stderr,"ERROR: q != 1 modulo p for M( %s ), q = %s \n", &cbuf0[convert_uint64_base10_char (cbuf0, p64)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } // Now compute k = (q-1)/2p, while verifying that q%2p = 1: @@ -2117,7 +2117,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], 
&cbuf2[convert_uint192_base10_char(cbuf2, res192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* res192 = twopmodq160(p192, q192); @@ -2127,7 +2127,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], &cbuf2[convert_uint192_base10_char(cbuf2, res192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } */ res192 = twopmodq192(p192, q192); @@ -2137,7 +2137,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], &cbuf2[convert_uint192_base10_char(cbuf2, res192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */ @@ -2148,19 +2148,19 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq160_q4(p192,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq160_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } */ res64 = twopmodq192_q4((uint64*)&p192,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } #elif(TRYQ == 8) @@ -2170,19 +2170,19 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq160_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n", + fprintf(stderr,"ERROR: twopmodq160_q8( %s, %s x 8 ) 
failed to find factor, res = %#2X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } */ res64 = twopmodq192_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n", + fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } #endif @@ -2212,7 +2212,7 @@ if((q128.d1 >> 14) == 0) { fprintf(stderr,"ERROR: q != 1 modulo p for M( %s ), q = %s \n", &cbuf0[convert_uint64_base10_char (cbuf0, p64)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } // Now compute k = (q-1)/2p, while verifying that q%2p = 1: @@ -2223,7 +2223,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], &cbuf2[convert_uint192_base10_char(cbuf2, res192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res192 = twopmodq192(p192, q192); @@ -2233,7 +2233,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], &cbuf2[convert_uint192_base10_char(cbuf2, res192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* this is currently sse2/msvc only : @@ -2242,10 +2242,10 @@ if((q128.d1 >> 14) == 0) { x256 = twopmodq200_8WORD_DOUBLE((uint64*)&p192, x192.d0); res64 = !x256.d3 && (uint64)CMPEQ192(x256, ONE192); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %llu\n", + fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %" PRIu64 "\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], res64); - 
ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } */ @@ -2263,7 +2263,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint256_base10_char(cbuf0, p256)], &cbuf1[convert_uint256_base10_char(cbuf1, q256)], &cbuf2[convert_uint256_base10_char(cbuf2, x256)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } res256 = twopmodq256(p256, q256); @@ -2273,7 +2273,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint256_base10_char(cbuf0, p256)], &cbuf1[convert_uint256_base10_char(cbuf1, q256)], &cbuf2[convert_uint256_base10_char(cbuf2, res256)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */ @@ -2283,10 +2283,10 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq192_q4((uint64*)&p192,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 15) { - fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } #elif(TRYQ == 8) @@ -2295,10 +2295,10 @@ if((q128.d1 >> 14) == 0) { res64 = twopmodq192_q8(p192,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0,x192.d0); if(res64 != 255) { - fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = 0x%2X.\n", + fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 8 ) failed to find factor, res = %#2X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1, q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } #endif @@ -2342,7 +2342,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)], &cbuf1[convert_uint128_base10_char(cbuf1, x128)]); fprintf(stderr," q128.d1 += fac63[i].q overflows!\n"); - ASSERT(HERE, q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q"); /* Make sure sum didn't overflow */ + 
ASSERT(q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q"); /* Make sure sum didn't overflow */ } /* Now multiply the 128-bit 63x65-bit factor product by each 64-bit test factor in turn. */ @@ -2384,7 +2384,7 @@ if((q128.d1 >> 14) == 0) { &cbuf2[convert_uint128_base10_char(cbuf2, x128)], &cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)], &cbuf4[convert_uint192_base10_char(cbuf4, q192)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } p256.d0 = p192.d0; q256.d0 = q192.d0; @@ -2401,7 +2401,7 @@ if((q128.d1 >> 14) == 0) { &cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)], &cbuf4[convert_uint256_base10_char(cbuf4, q256)], &cbuf5[convert_uint256_base10_char(cbuf5, res256)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } /* In debug mode, also test the multiple-q versions of the modular exponentiation routines: */ @@ -2413,10 +2413,10 @@ if((q128.d1 >> 14) == 0) { &cbuf1[convert_uint64_base10_char (cbuf1, fac63[i].q)], &cbuf2[convert_uint128_base10_char(cbuf2, x128)], &cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)]); - fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq192_q4( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1,q192)], (uint32)res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #elif(TRYQ == 8) res64 = twopmodq192_q8(p192,q192,q192,q192,q192,q192,q192,q192,q192); @@ -2426,10 +2426,10 @@ if((q128.d1 >> 14) == 0) { &cbuf1[convert_uint64_base10_char (cbuf1, fac63[i].q)], &cbuf2[convert_uint128_base10_char(cbuf2, x128)], &cbuf3[convert_uint64_base10_char (cbuf3, fac64[i3].q)]); - fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 4 ) failed to find factor, res = 0x%1X.\n", + fprintf(stderr,"ERROR: twopmodq192_q8( %s, %s x 4 ) failed to find factor, res = %#1X.\n", &cbuf0[convert_uint192_base10_char(cbuf0, p192)], &cbuf1[convert_uint192_base10_char(cbuf1,q192)], (uint32)res64); - ASSERT(HERE, 
0,"0"); + ASSERT(0,"0"); } #endif } @@ -2451,7 +2451,7 @@ if((q128.d1 >> 14) == 0) { { p256 = convert_base10_char_uint256(fac256[i].p); ADD256(p256,p256,two_p256); q256 = convert_base10_char_uint256(fac256[i].q); - ASSERT(HERE, CMPEQ256(xmody256(q256, two_p256, &x256), ONE256), "ERROR: q%(2p) != 1"); + ASSERT(CMPEQ256(xmody256(q256, two_p256, &x256), ONE256), "ERROR: q%(2p) != 1"); res256 = twopmodq256(p256, q256); if(!CMPEQ256(res256, ONE256)) { @@ -2459,7 +2459,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint256_base10_char(cbuf0, p256)], &cbuf1[convert_uint256_base10_char(cbuf1, q256)], &cbuf2[convert_uint256_base10_char(cbuf2, res256)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } #if 0 /************* need to use k-based for FP200! **********/ /* this is currently sse2/msvc only : @@ -2468,14 +2468,14 @@ if((q128.d1 >> 14) == 0) { { p128.d0 = p192.d0; p128.d1 = p192.d1; - printf("twopmodq200, p = %s, k = %llu\n", fac256->p, x256.d0); + printf("twopmodq200, p = %s, k = %" PRIu64 "\n", fac256->p, x256.d0); x256 = twopmodq200_8WORD_DOUBLE(p128, x256.d0); res64 = !x256.d3 && (uint64)CMPEQ192(x256, ONE192); if(res64 != 1) { - fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %llu\n", + fprintf(stderr,"ERROR: twopmodq200( %s, %s ) returns non-unity result %" PRIu64 "\n", &cbuf0[convert_uint256_base10_char(cbuf0, p256)], &cbuf1[convert_uint256_base10_char(cbuf1, q256)], res64); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } */ @@ -2518,7 +2518,7 @@ if((q128.d1 >> 14) == 0) { &cbuf0[convert_uint64_base10_char (cbuf0, fac63[i].q)], &cbuf1[convert_uint128_base10_char(cbuf1, x128)]); fprintf(stderr," q128.d1 += fac63[i].q overflows!\n"); - ASSERT(HERE, q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q"); /* Make sure sum didn't overflow */ + ASSERT(q128.d1 > fac63[i].q,"q128.d1 > fac63[i].q"); /* Make sure sum didn't overflow */ } /* Now multiply the 128-bit 63x65-bit factor product by the product of each pair of 64-bit test factors in turn. 
*/ @@ -2555,7 +2555,7 @@ if((q128.d1 >> 14) == 0) { &cbuf4[convert_uint64_base10_char (cbuf4, fac64[jj].q)], &cbuf5[convert_uint256_base10_char(cbuf5, q256)], &cbuf6[convert_uint256_base10_char(cbuf6, res256)]); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } } diff --git a/src/fermat_mod_square.c b/src/fermat_mod_square.c index fe6d3f32..1cae6c6a 100644 --- a/src/fermat_mod_square.c +++ b/src/fermat_mod_square.c @@ -241,7 +241,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui // v20: got rid of 1st constraint, so we can use a single mode_flag value in p-1 stage 2 for both vecs we want to fwd-FFT-only // but input in fwd-FFT-pass-1-already-done mode and ones where we do both FFTs, input in said form and left so on return: // if(fwd_fft == 1ull) - // ASSERT(HERE, mode_flag < 2, "Only low bit of mode_flag field may be used in this case!"); + // ASSERT(mode_flag < 2, "Only low bit of mode_flag field may be used in this case!"); } /* These came about as a result of multithreading, but now are needed whether built unthreaded or multithreaded */ @@ -266,12 +266,12 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui #endif #ifdef USE_IMCI512 // 1st-gen Xeon Phi - Use modified 8x8 doubles-transpose algo [1a] from util.c:test_simd_transpose_8x8() - ASSERT(HERE,0,"Fermat-mod unsupported in k1om / IMCI-512 build mode!"); + ASSERT(0,"Fermat-mod unsupported in k1om / IMCI-512 build mode!"); exit(1); #endif radix0 = RADIX_VEC[0]; nchunks = radix0; - ASSERT(HERE, TRANSFORM_TYPE == RIGHT_ANGLE, "fermat_mod_square: Incorrect TRANSFORM_TYPE!"); + ASSERT(TRANSFORM_TYPE == RIGHT_ANGLE, "fermat_mod_square: Incorrect TRANSFORM_TYPE!"); /*...initialize things upon first entry */ @@ -296,26 +296,26 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui N2 =n/2; /* Complex vector length. 
*/ if(!arr_scratch) { sprintf(cbuf, "Init portion of %s requires non-null scratch array!",func); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } for(i = 0; i < NRADICES; i++) { if(RADIX_VEC[i] == 0) { sprintf(cbuf, "%s: RADIX_VEC[i = %d] zero, for i < [NRADICES = %d]!",func,i,NRADICES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } radix_set_save[i] = RADIX_VEC[i]; } for(i = NRADICES; i < 10; i++) { if(RADIX_VEC[i] != 0) { sprintf(cbuf, "%s: RADIX_VEC[i = %d] nonzero, for i >= [NRADICES = %d]!",func,i,NRADICES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } radix_set_save[i] = 0; } /*...Check that the binary exponent corresponds to a proper Fermat index: */ findex = trailz64(p); - ASSERT(HERE, p >> findex == 1,"fermat_mod_square.c: p >> findex == 1"); + ASSERT(p >> findex == 1,"fermat_mod_square.c: p >> findex == 1"); // Set function pointers for DIF|DIT pass1: dif1_dit1_func_name( radix0, &func_dif1, &func_dit1 ); @@ -323,18 +323,18 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui /* My array padding scheme requires N/radix0 to be a power of 2, and to be >= 2^DAT_BITS, where the latter parameter is set in the Mdata.h file: */ if(n%radix0 != 0) { - sprintf(cbuf ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Make sure n/radix0 is a power of 2: */ i = n/radix0; if((i >> trailz32(i)) != 1) { - sprintf(cbuf ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } if(DAT_BITS < 31) { /* Now make sure n/radix0 is sufficiently large (unless n < 2^DAT_BITS, in which case it doesn't matter): */ if(i < (1 << DAT_BITS)) { - // sprintf(cbuf ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS)); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + // sprintf(cbuf 
,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS)); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); // Mar 2018: Switch to 'soft' assertion error here, e.g. for timing tests at small FFT lengths: sprintf(cbuf ,"n/radix0 must be >= %u! Skipping this radix combo.\n", (1 << DAT_BITS)); WARN(HERE, cbuf, "", 1); return(ERR_ASSERT); } @@ -342,7 +342,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if((1 << DAT_BITS) < 2*RADIX_VEC[NRADICES-1]) { sprintf(cbuf ,"ERROR: Value of DAT_BITS means final FFT radix may not exceed = %u!\n", (1 << (DAT_BITS-1))); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } } @@ -374,7 +374,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(mm*RADIX_VEC[NRADICES-1] != N2) { sprintf(cbuf ,"ERROR: product of radices not equal to complex vector length\n"); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* index_ptmp = ALLOC_INT(index_ptmp, k); <*** Jan 2020: Started getting this error here, NFC as to why: malloc: *** error for object 0x100802608: incorrect checksum for freed object - object was probably modified after being freed. @@ -383,7 +383,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui { sprintf(cbuf ,"ERROR: unable to allocate array INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } index = ALIGN_INT(index_ptmp); */ @@ -536,7 +536,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui default: sprintf(cbuf ,"ERROR: radix %d not available for Fermat-mod transform. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } for(i = 1; i < NRADICES; i++) @@ -576,7 +576,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui default: sprintf(cbuf ,"ERROR: intermediate radix %d not available. 
Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* Final radix must be 16 or 32: */ @@ -584,7 +584,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui { sprintf(cbuf ,"ERROR: final radix %d not available. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } } nradices_prim = l; for( ; l < 30; l++) { radix_prim[l] = 0; } // Zero any higher elements which may have been previously set due @@ -620,8 +620,8 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui baseinv[0] = (double)(1.0/base[0] ); baseinv[1] = (double)(1.0/base[1]); /* don't need extended precision for this since both bases are powers of 2. */ /*...stuff for the reduced-length DWT weights arrays is here: */ - wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt); if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp); - wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, nwt); if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp); + wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt); if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp); + wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, nwt); if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp); /******************************************************************/ /* Crandall/Fagin weighting factors and number of bits per digit. 
*/ @@ -629,7 +629,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui /* Double-check that sw*nwt (where nwt is the odd factor of N) is divisible by N: */ // printf("sw,nwt,n = %u,%u,%u; sw*nwt mod n = %u\n",sw,nwt,n, (uint64)sw*nwt % n); - ASSERT(HERE, (uint64)sw*nwt % n == 0,"fermat_mod_square.c: sw*nwt % n == 0"); + ASSERT((uint64)sw*nwt % n == 0,"fermat_mod_square.c: sw*nwt % n == 0"); SW_DIV_N = sw*nwt/n; qn = i64_to_q((int64) nwt); @@ -652,7 +652,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QWT = %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } for(i = 0; i < nwt; i++) @@ -672,7 +672,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QWT0 = %20.15f, DWT0 = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Inverse DWT weight factor: */ @@ -692,7 +692,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QWT1 = %20.15f, DWT1 = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qwt= qfmul(qwt, qmul); } @@ -705,14 +705,14 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui /* No need for a fancy NINT here: */ NRT_BITS = (uint32)(log(sqrt(1.0*n))/log(2.0) + 0.5); NRT = 1 << NRT_BITS; - if(n%NRT){ sprintf(cbuf,"ERROR: NRT does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(n%NRT){ sprintf(cbuf,"ERROR: NRT does not divide N!\n"); 
fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } NRTM1 = NRT - 1; /*...The rt0 array stores the (0:NRT-1)th powers of the [N2]th root of unity (i.e. will be accessed using the lower (NRT) bits of the integer sincos index): */ rt0_ptmp = ALLOC_COMPLEX(rt0_ptmp, NRT); - if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt0 = ALIGN_COMPLEX(rt0_ptmp); qt = i64_to_q((int64)N2); @@ -736,7 +736,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QCOS1= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } t1 = qfdbl(qi); @@ -753,7 +753,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QSIN1= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qt = QZRO; @@ -775,7 +775,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt0[i].re = t1; @@ -794,7 +794,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); 
ASSERT(0,cbuf); } rt0[i].im = t1; @@ -811,7 +811,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui (and will be accessed using the upper bits, , of the integer sincos index): */ rt1_ptmp = ALLOC_COMPLEX(rt1_ptmp, n/(2*NRT)); - if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt1 = ALIGN_COMPLEX(rt1_ptmp); qn = i64_to_q((int64)NRT); @@ -837,7 +837,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QCOS2= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } t1 = qfdbl(qi); @@ -854,7 +854,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QSIN2= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qt = QZRO; @@ -876,7 +876,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt1[i].re = t1; @@ -895,7 +895,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); 
ASSERT(0,cbuf); } rt1[i].im = t1; @@ -916,7 +916,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui /*...The rn0 array stores the (0:NRT-1)th powers of the [2*n]th root of unity (i.e. will be accessed using the lower (NRT) bits of the integer sincos index): */ - rn0_ptmp = ALLOC_COMPLEX(rn0_ptmp, NRT); if(!rn0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } rn0 = ALIGN_COMPLEX(rn0_ptmp); + rn0_ptmp = ALLOC_COMPLEX(rn0_ptmp, NRT); if(!rn0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rn0 = ALIGN_COMPLEX(rn0_ptmp); qt = i64_to_q((int64)N2); qtheta = qfdiv(QPIHALF, qt); /* (2*pi)/(2*N) = (pi/2)/(N/2) */ @@ -938,7 +938,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QCOS1= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } t1 = qfdbl(qi); @@ -954,7 +954,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QSIN1= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qt = QZRO; @@ -976,7 +976,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rn0[i].re = t1; @@ -994,7 +994,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = 
idiff; sprintf(cbuf,"INFO: I = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rn0[i].im = t1; @@ -1010,7 +1010,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui /*...The rn1 array stores the (0:(n/2)/NRT-1)th powers of the [(n/2)/NRT]th root of unity (and will be accessed using the upper bits, , of the integer sincos index): */ - rn1_ptmp = ALLOC_COMPLEX(rn1_ptmp, N2/NRT); if(!rn1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } rn1 = ALIGN_COMPLEX(rn1_ptmp); + rn1_ptmp = ALLOC_COMPLEX(rn1_ptmp, N2/NRT); if(!rn1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RN1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rn1 = ALIGN_COMPLEX(rn1_ptmp); qn = i64_to_q((int64)NRT); qt = i64_to_q((int64)N2); @@ -1034,7 +1034,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QCOS2= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } t1 = qfdbl(qi); @@ -1050,7 +1050,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QSIN2= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qt = QZRO; @@ -1072,7 +1072,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + 
fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rn1[i].re = t1; @@ -1090,7 +1090,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rn1[i].im = t1; @@ -1108,7 +1108,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui fprintf(stderr, "%s:\n",func); fprintf(stderr, " Max abs error between real*8 and real*16 computed values = %20.15f\n", max_adiff); fprintf(stderr, " Max bit error between real*8 and real*16 computed values = %20.0f \n", (double)max_idiff); - ASSERT(HERE, (max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting."); + ASSERT((max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting."); } #ifdef MULTITHREAD @@ -1141,13 +1141,13 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui } // Threadpool-based dispatch: - ASSERT(HERE, MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!"); + ASSERT(MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!"); if(radix0 % NTHREADS != 0) fprintf(stderr,"%s: radix0 not exactly divisible by NTHREADS - This will hurt performance.\n",func); main_work_units = 0; pool_work_units = radix0; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("%s: Init threadpool of %d threads\n",func,NTHREADS); #endif // MULTITHREAD? 
@@ -1192,11 +1192,11 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui // v20: Add support for mod_mul with one input being in precomputed fwd-FFTed form: #ifdef MULTITHREAD for(i = 0; i < nchunks; ++i) { tdat[i].arrdat = a; tdat[i].fwd_fft = fwd_fft; tdat[i].c = c; } -// printf("Thread 0: arrdat = 0x%llX, fwd_fft = 0x%llX\n",tdat[0].arrdat,tdat[0].fwd_fft); +// printf("Thread 0: arrdat = %#" PRIX64 ", fwd_fft = %#" PRIX64 "\n",tdat[0].arrdat,tdat[0].fwd_fft); #endif /*...Init clock counter: */ - ASSERT(HERE, tdiff != 0,"fermat_mod_square.c: tdiff != 0"); + ASSERT(tdiff != 0,"fermat_mod_square.c: tdiff != 0"); #ifdef CTIME clock1 = clock(); @@ -1216,7 +1216,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui goto undo_initial_ffft_pass; if((mode_flag & 1) == 0) { - // fprintf(stderr,"Array = 0x%llX, Iter = %u, Fwd-WT: mode_flag = 0x%X, ilo = %u, a[1] = %18.10f\n",(uint64)a,ilo+1,mode_flag,ilo,a[1]); + // fprintf(stderr,"Array = %#" PRIX64 ", Iter = %u, Fwd-WT: mode_flag = %#X, ilo = %u, a[1] = %18.10f\n",(uint64)a,ilo+1,mode_flag,ilo,a[1]); // Mar 2017: Can skip this step if it's the start of a production test (note that any initial-residue shift // in such cases is handled via single-array-word forward-DWT-weighting in the Mlucas.c shift_word() function), // but need it if add RNG-input-setting above for debug, hence also check a[1] for nonzero: @@ -1236,14 +1236,14 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui j1 = j; #endif j1 = j1 + ( (j1>> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */ - ASSERT(HERE, DNINT(a[j1]) == a[j1],"fermat_mod_square.c: Input a[j] noninteger!"); + ASSERT(DNINT(a[j1]) == a[j1],"fermat_mod_square.c: Input a[j] noninteger!"); wt = wt0[ii]; a[j1] *= wt; ii += SW_DIV_N - nwt; ii += ( (-(int)((uint32)ii >> 31)) & nwt); } /* Odds: */ - ASSERT(HERE, ii == 0,"fermat_mod_square.c: ii == 0"); + ASSERT(ii == 
0,"fermat_mod_square.c: ii == 0"); for(j = 0; j < n; j += 2) { #ifdef USE_AVX512 @@ -1257,7 +1257,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui #endif j1 = j1 + ( (j1>> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */ j2 = j1 + RE_IM_STRIDE; - ASSERT(HERE, DNINT(a[j2]) == a[j2],"fermat_mod_square.c: Input a[j] noninteger!"); + ASSERT(DNINT(a[j2]) == a[j2],"fermat_mod_square.c: Input a[j] noninteger!"); wt = wt0[ii]; a[j2] *= wt; ii += SW_DIV_N - nwt; @@ -1310,7 +1310,7 @@ int fermat_mod_square(double a[], int arr_scratch[], int n, int ilo, int ihi, ui */ ierr = 0; /* Any return-value error code (whether fatal or not) stored here */ - ASSERT(HERE, ihi > ilo,"ferm_mod_square.c: ihi <= ilo!"); + ASSERT(ihi > ilo,"ferm_mod_square.c: ihi <= ilo!"); #if DBG_THREADS fprintf(stderr,"%s: NTHREADS = %3d\n",func,NTHREADS); @@ -1354,7 +1354,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) while(tpool->free_tasks_queue.num_tasks != pool_work_units) { // sleep(1); //*** too granular *** // Finer-resolution, declared in ; cf. 
http://linux.die.net/man/2/nanosleep - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!"); // printf("sleep; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); } // printf("end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1389,7 +1389,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) MOD_ADD64(RES_SHIFT,RES_SHIFT,p,RES_SHIFT); RES_SHIFT += ((BASE_MULTIPLIER_BITS[i>>6] >> (i&63)) & 1); // No mod needed on this add, since result of pvs line even and < p, which is itself even in the Fermat-mod case (p = 2^m) const char flip[2] = {' ','*'}; -// printf("Iter %d: shift = [%c]%llu\n",iter,flip[RES_SIGN],RES_SHIFT); +// printf("Iter %d: shift = [%c]%" PRIu64 "\n",iter,flip[RES_SIGN],RES_SHIFT); #endif } /*...Do the final inverse FFT pass, carry propagation and initial forward FFT pass in one fell swoop, er, swell loop... */ @@ -1473,7 +1473,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) ierr = radix4096_ditN_cy_dif1 (a,n,nwt,nwt_bits,wt0,wt1,0x0,rn0,rn1,base,baseinv,iter,&fracmax,p); break; */ default: - sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. 
Halting...\n",radix0); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } // v19: Nonzero exit carries used to be fatal, added retry-from-last-savefile handling for these @@ -1566,12 +1566,12 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) // On early-exit-due-to-interrupt, decrement iter since we didn't actually do the (iter)th iteration if(!MLUCAS_KEEP_RUNNING) iter--; if(iter < ihi) { - ASSERT(HERE, !MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!"); + ASSERT(!MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!"); ierr = ERR_INTERRUPT; ROE_ITER = iter; // Function return value used for error code, so save number of last-iteration-completed-before-interrupt here -// fprintf(stderr,"Caught signal at iter = %u; mode_flag = 0x%X\n",iter,mode_flag); +// fprintf(stderr,"Caught signal at iter = %u; mode_flag = %#X\n",iter,mode_flag); mode_flag &= 0xfffffffd; // v20: In case of interrupt-exit override any mode_flag "skip undo of initial DIF pass" setting -// fprintf(stderr,"After ^2-toggle, mode_flag = 0x%X, (mode_flag >> 1) = 0x%X\n",mode_flag,mode_flag>>1); +// fprintf(stderr,"After ^2-toggle, mode_flag = %#X, (mode_flag >> 1) = %#X\n",mode_flag,mode_flag>>1); } #ifdef RTIME @@ -1600,10 +1600,10 @@ if(iter < ihi) { // v20: Add support for fwd_fft_only|mode_flag as described in top-of-function comments undo_initial_ffft_pass: -// printf("Iter %u: ierr = %u, fwd_fft = %llu, mode_flag = %u\n",iter,ierr,fwd_fft,mode_flag); +// printf("Iter %u: ierr = %u, fwd_fft = %" PRIu64 ", mode_flag = %u\n",iter,ierr,fwd_fft,mode_flag); if((mode_flag >> 1) == 0) { - // fprintf(stderr,"Array = 0x%llX, Iter = %u, Inv-WT: mode_flag = 0x%X\n",(uint64)a,iter,mode_flag); + // fprintf(stderr,"Array = %#" PRIX64 ", Iter = %u, Inv-WT: mode_flag = %#X\n",(uint64)a,iter,mode_flag); func_dit1(a,n); /*...and unweight the data array. 
*/ @@ -1636,7 +1636,7 @@ if(iter < ihi) { ii += ( (-(int)((uint32)ii >> 31)) & nwt);\ } /* Odds: */ - ASSERT(HERE, ii == 0,"fermat_mod_square.c: ii == 0"); + ASSERT(ii == 0,"fermat_mod_square.c: ii == 0"); for(j = 0; j < n; j += 2) { #ifdef USE_AVX512 @@ -1723,7 +1723,7 @@ if(iter < ihi) { // [action] Prior to returning, print a "retry successful" informational and rezero ROE_ITER and ROE_VAL. // *** v20: For PRP-test Must make sure we are at end of checkpoint-file iteration interval, not one of the Gerbicz-update subintervals *** if(!INTERACT && ROE_ITER > 0 && ihi%ITERS_BETWEEN_CHECKPOINTS == 0) { // In interactive (timing-test) mode, use ROE_ITER to accumulate #iters-with-dangerous-ROEs - ASSERT(HERE, (ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!"); + ASSERT((ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!"); ROE_ITER = 0; ROE_VAL = 0.0; sprintf(cbuf,"Retry of iteration interval with fatal roundoff error was successful.\n"); @@ -1789,7 +1789,7 @@ void fermat_process_chunk( dyadic-multiply FFT(a) * FFT(b) and iFFT the product, storing the result in a[]. */ if((fwd_fft & 0xC) != 0) { - ASSERT(HERE, ((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *"); + ASSERT(((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *"); incr = RADIX_VEC[NRADICES-1]<<1; } else { for(i=1; i <= NRADICES-2; i++) @@ -1805,7 +1805,7 @@ void fermat_process_chunk( case 32: radix32_dif_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break; default: - sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for dif_pass. 
Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } k += mm*radix0; mm *= RADIX_VEC[i]; @@ -1827,7 +1827,7 @@ void fermat_process_chunk( case 32: radix32_dyadic_square(&a[jstart],arr_scratch,n,radix0,rt0,rt1,ii,nradices_prim,radix_prim,incr,init_sse2,thr_id, bptr, cptr); break; default: - sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } #ifdef DBG_TIME clock2 = clock(); @@ -1878,7 +1878,7 @@ void fermat_process_chunk( case 32: radix32_dit_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break; default: - sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for dit_pass. 
Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } /* end i-loop */ diff --git a/src/fgt_m61.c b/src/fgt_m61.c index 224642e5..36c28181 100755 --- a/src/fgt_m61.c +++ b/src/fgt_m61.c @@ -104,9 +104,9 @@ uint64 prodq8(const uint64 x, const uint64 y) #else MUL_LOHI64(x,y, lo, hi); #endif - ASSERT(HERE, (lo & 7) == 0, "ERROR: product not divisible by 8 in PRODQ8!"); -//if(hi + (lo >> 3) > q2) fprintf(stderr, "PRODQ8 inputs: %llu,%llu, outputs: %llu,%llu, result = %llu\n",x,y,lo,hi,hi + (lo >> 3)); -// ASSERT(HERE, hi + (lo >> 3) <= q2, "ERROR: result out of range in PRODQ8!"); + ASSERT((lo & 7) == 0, "ERROR: product not divisible by 8 in PRODQ8!"); +//if(hi + (lo >> 3) > q2) fprintf(stderr, "PRODQ8 inputs: %" PRIu64 ",%" PRIu64 ", outputs: %" PRIu64 ",%" PRIu64 ", result = %" PRIu64 "\n",x,y,lo,hi,hi + (lo >> 3)); +// ASSERT(hi + (lo >> 3) <= q2, "ERROR: result out of range in PRODQ8!"); return hi + (lo >> 3); // hi + (lo/8) } @@ -116,7 +116,7 @@ uint64 prodq8(const uint64 x, const uint64 y) // [2015: Replace elaborate case-based impl of original with simple MULQ, which is fast on more or less all 64-bit arches.] uint64 mul_by_3bit(const uint64 a, const uint64 x) { - ASSERT(HERE, (x >> 61) == 0, "ERROR: x out of range in MUL_BY_3BIT!"); + ASSERT((x >> 61) == 0, "ERROR: x out of range in MUL_BY_3BIT!"); return a * x; } @@ -130,7 +130,7 @@ uint64 mul_by_3bit(const uint64 a, const uint64 x) // Output bounds: *********************** To-Do! 
********************* uint64 rmul_modq(const uint64 a, const uint64 b) { - ASSERT(HERE, a < 0x8000000000000000ull && b < 0x4000000000000000ull, "Input(s) out of range!"); + ASSERT(a < 0x8000000000000000ull && b < 0x4000000000000000ull, "Input(s) out of range!"); return prodq8(a<<1, b<<2); } @@ -197,12 +197,12 @@ uint64 rmul_modq(const uint64 x, const uint64 y) hi4est = (uint64)dhi - 1; error_mod4 = ( (bd_lo >> 62) - hi4est ) & 3ull; // Error mod 4 bd_hi = (hi4est + error_mod4) >> 2; - ASSERT(HERE, bd_lo <= (bd_hi << 3) + bd_lo, "ERROR: overflow of b*d(lo + hi>>3) summand!"); + ASSERT(bd_lo <= (bd_hi << 3) + bd_lo, "ERROR: overflow of b*d(lo + hi>>3) summand!"); bd_modq = qreduce((bd_hi << 3) + bd_lo); ay = mul_by_3bit((x >> 58),y); cb = mul_by_3bit((y >> 58), (x & two58m1)); - ASSERT(HERE, cb <= ay+cb, "ERROR: overflow of ay+cb summand!"); + ASSERT(cb <= ay+cb, "ERROR: overflow of ay+cb summand!"); bd_modq = qreduce((bd_hi << 3) + bd_lo); // Now form [(a*y + c*b)*2^58 + b*d] mod q. @@ -465,24 +465,24 @@ The CMUL_MODQ8 variant assumes the inputs are premultiplied by 8 and thus cuts I void cmul_modq(const uint64 a0, const uint64 a1, const uint64 b0, const uint64 b1, uint64*xout, uint64*yout) { uint64 t00,t01,t11; - ASSERT(HERE, (a0 <= bb && a1 <= bb && b0 <= q && b1 <= q), "ERROR: CMUL_MODQ input out of range!"); + ASSERT((a0 <= bb && a1 <= bb && b0 <= q && b1 <= q), "ERROR: CMUL_MODQ input out of range!"); // Bounds: b0,b1 in [0,q], so 4(b0+b1) in [0,8q]; prodq8lo/8 always in [0, q]. 
t00 = prodq8(a0, b0<<3); // a0 in [0, B]: prodq8hi(a0,8b0) in [0, q], t00 in [0, 2q] t11 = prodq8(a1, b1<<3); // a1 in [0, B]: prodq8hi(a1,8b1) in [0, q], t11 in [0, 2q] *xout = t00 - t11 + q2; // xout in [0, 4q] - ASSERT(HERE, *xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!"); + ASSERT(*xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!"); #if !KARATSUBA_CMUL // Standard complex 4-multiply: t01 = prodq8(a0, b1<<3); // a0 in [0, B]: prodq8hi(a0,8b1) in [0, q], t01 in [0, 2q] *yout = prodq8(a1, b0<<3) + t01; // a1 in [0, B]: prodq8hi(a1,8b0) in [0, q], t10 in [0, 2q]; yout in [0, 4q] - ASSERT(HERE, *yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!"); + ASSERT(*yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!"); #else // Karatsuba variant: t01 = prodq8((a0 + a1)<<1, (b0 + b1)<<2); // prodq8hi( 2(a0+a1) , 4(b0+b1) ) in [0,4q], t01 in [0, 5q] *yout = qreduce(t01 - t00 - t11 + q4); // t01 in [0, 5q] but t01-t00-t11 in [0,4q], so no overflow in t01-t00-t11+q4. - ASSERT(HERE, t01 <= (q4 + q), "ERROR: t01 > 5q in Karatsuba-part of CMUL_MODQ!"); + ASSERT(t01 <= (q4 + q), "ERROR: t01 > 5q in Karatsuba-part of CMUL_MODQ!"); // This version reduces both parts of the output: #if 0 uint64 tmp = (q<<2) - t11; // tmp in [2q, 4q] @@ -501,19 +501,19 @@ void cmul_modq(const uint64 a0, const uint64 a1, const uint64 b0, const uint64 b void cmul_modq8(const uint64 a0, const uint64 a1, const uint64 b0, const uint64 b1, uint64*xout, uint64*yout) { uint64 t00,t01,t11; - ASSERT(HERE, ( a0 <= bb && a1 <= bb ), "ERROR: CMUL_MODQ8 A-input out of range!"); - ASSERT(HERE, (!(b0 & 7) && !(b1 & 7)), "ERROR: CMUL_MODQ8 B-input not divisible by 8!"); + ASSERT(( a0 <= bb && a1 <= bb ), "ERROR: CMUL_MODQ8 A-input out of range!"); + ASSERT((!(b0 & 7) && !(b1 & 7)), "ERROR: CMUL_MODQ8 B-input not divisible by 8!"); // Bounds: b0,b1 in [0,q], so 4(b0+b1) in [0,8q]; prodq8lo/8 always in [0, q]. 
t00 = prodq8(a0, b0); // a0 in [0, B]: prodq8hi(a0,8b0) in [0, q], t00 in [0, 2q] t11 = prodq8(a1, b1); // a1 in [0, B]: prodq8hi(a1,8b1) in [0, q], t11 in [0, 2q] *xout = t00 - t11 + q2; // xout in [0, 4q] - ASSERT(HERE, *xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!"); + ASSERT(*xout <= q4, "ERROR: xout > 4q in CMUL_MODQ!"); // Standard complex 4-multiply is only option here: t01 = prodq8(a0, b1); // a0 in [0, B]: prodq8hi(a0,8b1) in [0, q], t01 in [0, 2q] *yout = prodq8(a1, b0) + t01; // a1 in [0, B]: prodq8hi(a1,8b0) in [0, q], t10 in [0, 2q]; yout in [0, 4q] - ASSERT(HERE, *yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!"); + ASSERT(*yout <= q4, "ERROR: yout > 4q in CMUL_MODQ!"); return; } @@ -531,10 +531,10 @@ void csqr_modq(const uint64 a0, const uint64 a1, uint64*xout, uint64*yout) { // This version reduces both parts of the output... *xout = prodq8((a0 + a1)<<1, (a0 - a1 + q)<<2); // prodq8hi( 2(a0+a1) , 4(a0-a1+q) ) in [0,4q]; xout in [0,5q] - ASSERT(HERE, *xout <= (q4+q), "ERROR: xout >= 5q in CSQR_MODQ!"); + ASSERT(*xout <= (q4+q), "ERROR: xout >= 5q in CSQR_MODQ!"); *yout = prodq8(a0<<2, a1<<2); // prodq8hi( 4*a0 , 4*a1 ) in [0,2q]; yout in [0,3q] - ASSERT(HERE, *yout < (q4-q), "ERROR: yout > 3q in CSQR_MODQ!"); + ASSERT(*yout < (q4-q), "ERROR: yout > 3q in CSQR_MODQ!"); } /***************/ @@ -574,12 +574,12 @@ void prim_root_q(const uint64 ord, uint64*root_re, uint64*root_im) uint64 r0,i0,rm,im,rtmp,itmp,pow; // Maximal order (q^2-1) = 2^62 * (2^60-1), allowing power-of-2 roots up to 2^62: - ASSERT(HERE, zbits < 63, "PRIM_ROOT_Q: Maximal power-of-2 roots = 2^62!"); + ASSERT(zbits < 63, "PRIM_ROOT_Q: Maximal power-of-2 roots = 2^62!"); // First raise result to the [(2^60-1)/(ord >> trailz(ord))]th power using LR binary powering: itmp = (1ull << 60) - 1; pow = itmp/(ord >> zbits); // Odd component of the needed power; this should have 0 remainder for legal ord values - ASSERT(HERE, itmp == pow*(ord >> zbits), "pow does not divide 2^60-1!"); + ASSERT(itmp == 
pow*(ord >> zbits), "pow does not divide 2^60-1!"); pow = pow << (leadz64(pow)+1); // Left-justify pow and shift leftmost bit off. // 6 + I is a primitive root of full order q^2 - 1: diff --git a/src/get_cpuid.c b/src/get_cpuid.c index a305fc27..b1aa3385 100755 --- a/src/get_cpuid.c +++ b/src/get_cpuid.c @@ -162,10 +162,10 @@ __cpuid(CPUInfo, i); #if 0 printf("\nFor InfoType %d\n", i); - printf("CPUInfo[0] = 0x%x\n", CPUInfo[0]); - printf("CPUInfo[1] = 0x%x\n", CPUInfo[1]); - printf("CPUInfo[2] = 0x%x\n", CPUInfo[2]); - printf("CPUInfo[3] = 0x%x\n", CPUInfo[3]); + printf("CPUInfo[0] = %#x\n", CPUInfo[0]); + printf("CPUInfo[1] = %#x\n", CPUInfo[1]); + printf("CPUInfo[2] = %#x\n", CPUInfo[2]); + printf("CPUInfo[3] = %#x\n", CPUInfo[3]); #endif /* Interpret CPU feature information. */ if(i == 1) @@ -199,10 +199,10 @@ { __cpuid(CPUInfo, i); printf("\nFor InfoType %x\n", i); - printf("CPUInfo[0] = 0x%x\n", CPUInfo[0]); - printf("CPUInfo[1] = 0x%x\n", CPUInfo[1]); - printf("CPUInfo[2] = 0x%x\n", CPUInfo[2]); - printf("CPUInfo[3] = 0x%x\n", CPUInfo[3]); + printf("CPUInfo[0] = %#x\n", CPUInfo[0]); + printf("CPUInfo[1] = %#x\n", CPUInfo[1]); + printf("CPUInfo[2] = %#x\n", CPUInfo[2]); + printf("CPUInfo[3] = %#x\n", CPUInfo[3]); /* Interpret CPU brand string and cache information. 
*/ if(i == 0x80000002) diff --git a/src/get_fft_radices.c b/src/get_fft_radices.c index 9e975d97..e939205a 100755 --- a/src/get_fft_radices.c +++ b/src/get_fft_radices.c @@ -2525,7 +2525,7 @@ int get_fft_radices(uint32 kblocks, int radix_set, uint32 *nradices, uint32 radi return ERR_FFTLENGTH_ILLEGAL; } - ASSERT(HERE, rvec[0] <= MAX_RADIX, "Leading radix exceeds value of MAX_RADIX set in Mdata.h file!"); + ASSERT(rvec[0] <= MAX_RADIX, "Leading radix exceeds value of MAX_RADIX set in Mdata.h file!"); // Check that there are at least 2 radices: if(numrad < 2) { @@ -2535,7 +2535,7 @@ int get_fft_radices(uint32 kblocks, int radix_set, uint32 *nradices, uint32 radi /* If user provided a radix array, make sure they gave a valid dimension: */ if(radix_vec) - ASSERT(HERE, radix_vec_dim >= numrad,"get_fft_radices: radix_vec_dim has illegal value!"); + ASSERT(radix_vec_dim >= numrad,"get_fft_radices: radix_vec_dim has illegal value!"); /* Check that N/2 = {product of the radices}, and if valid nradices and radix_vec pointers supplied, copy radices to the latter and numrad to the former: */ @@ -2560,7 +2560,7 @@ int get_fft_radices(uint32 kblocks, int radix_set, uint32 *nradices, uint32 radi if(rad_prod != n/2) { fprintf(stderr,"N = %u, radix_set = %u : product of complex radices %u != (FFT length/2)\n", n, radix_set, rad_prod); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } return 0; @@ -2596,18 +2596,18 @@ void test_fft_radixtables() } else if(retval == ERR_RADIXSET_UNAVAILABLE) { - ASSERT(HERE, radset != 0, "test_fft_radixtables: Should only see ERR_RADIXSET_UNAVAILABLE for nonzero radix set index!"); + ASSERT(radset != 0, "test_fft_radixtables: Should only see ERR_RADIXSET_UNAVAILABLE for nonzero radix set index!"); break; } else if(retval == ERR_FFTLENGTH_ILLEGAL) { fprintf(stderr,"ERROR: illegal FFT length %u K in test_fft_radixtables self-test!\n",kblocks); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } else { fprintf(stderr,"ERROR: unknown return value %d in 
test_fft_radixtables self-test; i = %d, kblocks = %u, radset = %u.\n", retval, i, kblocks, radset); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } ++i; @@ -2624,18 +2624,18 @@ uint32 get_default_fft_length(uint64 p) uint32 leadingRadixVec[N_LEADING_RADICES] = {8,9,10,11,12,13,14,15}; uint32 i, twoK, fftLen; - ASSERT(HERE, PMAX > PMIN,"get_default_fft_length: PMAX > PMIN"); + ASSERT(PMAX > PMIN,"get_default_fft_length: PMAX > PMIN"); if(p < PMIN || p > PMAX) { - fprintf(stderr,"get_default_fft_length: invalid value for exponent %llu\n",p); - ASSERT(HERE, 0,"0"); + fprintf(stderr,"get_default_fft_length: invalid value for exponent %" PRIu64 "\n",p); + ASSERT(0,"0"); return 0; } /* Starting with N = 1K, Loop over all FFT lengths of form {8,9,10,11,12,13,14,15}*2^m, and return the smallest one for which maxP >= p: */ i = 0; - ASSERT(HERE, 1024%leadingRadixVec[i] == 0,"get_default_fft_length: 1024%leadingRadixVec[0] == 0"); + ASSERT(1024%leadingRadixVec[i] == 0,"get_default_fft_length: 1024%leadingRadixVec[0] == 0"); twoK = 1024/leadingRadixVec[i]; fftLen = leadingRadixVec[i]*twoK; for(;;) @@ -2664,7 +2664,7 @@ uint32 get_default_fft_length(uint64 p) if((fftLen >> 10) == 589824) fprintf(stderr,"get_default_fft_length: Allowing fftLen 576M just for informational purposes ... 
note this length is not supported.\n"); else - ASSERT(HERE, 0,"get_default_fft_length: fftLen > MAX_FFT_LENGTH_IN_K!"); + ASSERT(0,"get_default_fft_length: fftLen > MAX_FFT_LENGTH_IN_K!"); return 0; } @@ -2679,7 +2679,7 @@ uint32 get_nextlarger_fft_length(uint32 n) if(get_fft_radices((n >> 10), 0, 0x0, 0x0, 0) != 0) { sprintf(cbuf, "get_nextlarger_fft_length: Illegal or Unsupported input FFT length %u\n", n); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } /* Extract leading 4 bits of input FFT lengths, thus decomposing it into the form {8,9,10,11,12,13,14,15}*2^m, @@ -2688,7 +2688,7 @@ uint32 get_nextlarger_fft_length(uint32 n) */ rem2 = 32 - leadz32(n) - 4; lead4 = n >> rem2; - ASSERT(HERE, lead4 > 7 && lead4 < 16,"get_nextlarger_fft_length: leading 4 bits of input FFT length out of range!"); + ASSERT(lead4 > 7 && lead4 < 16,"get_nextlarger_fft_length: leading 4 bits of input FFT length out of range!"); /* Make sure next-larger FFT length is supported: */ ++lead4; diff --git a/src/get_preferred_fft_radix.c b/src/get_preferred_fft_radix.c index 51d07b98..fa7a92c1 100755 --- a/src/get_preferred_fft_radix.c +++ b/src/get_preferred_fft_radix.c @@ -119,23 +119,23 @@ uint32 get_preferred_fft_radix(uint32 kblocks) if(i == kblocks) { if(found) { sprintf(cbuf,"Multiple cfg-file entries for FFT length %uK encountered in %s - please delete or comment out all but one entry for this length, save the file and retry.",kblocks,CONFIGFILE); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } else found = TRUE; } if(sscanf(char_addr + 11, "%lf", &tcurr) == 1) { // 11 chars in "msec/iter =" - ASSERT(HERE, tcurr >= 0, "tcurr < 0!"); + ASSERT(tcurr >= 0, "tcurr < 0!"); if((tbest == 0.0) || ((tcurr > 0.0) && (tcurr < tbest))) { if((char_addr = strstr(in_line, "radices =")) == 0x0) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: invalid format for %s file: 'radices =' not found in timing-data line %s", CONFIGFILE, in_line); - ASSERT(HERE, 0, cbuf); + 
snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: invalid format for %s file: 'radices =' not found in timing-data line %s", CONFIGFILE, in_line); + ASSERT(0, cbuf); } char_addr += 9; // 9 chars in "radices =" kprod = 1; /* accumulate product of radices */ for(j = 0; j < 10; j++) { /* Read in the radices */ if(sscanf(char_addr, "%d", &k) != 1) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: invalid format for %s file: failed to read %dth element of radix set, offending input line %s", CONFIGFILE, j, in_line); - ASSERT(HERE, 0, cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: invalid format for %s file: failed to read %dth element of radix set, offending input line %s", CONFIGFILE, j, in_line); + ASSERT(0, cbuf); } else { // Advance to next WS char following the current numeric token - since sscanf skips leading WS, // Must do this in 2 steps. NOTE we *need* the trailing ; here to serve as executable-statement @@ -147,15 +147,15 @@ uint32 get_preferred_fft_radix(uint32 kblocks) while( isspace(*char_addr)) char_addr++; // 1. First skip any WS preceding current numeric token while(!isspace(*char_addr)) char_addr++; // 2. Look for first WS char following current numeric token if(j == 0) - ASSERT(HERE, k <= 1024, "get_preferred_fft_radix: Leading radix > 1024: out of range!"); + ASSERT(k <= 1024, "get_preferred_fft_radix: Leading radix > 1024: out of range!"); else if(k) { - ASSERT(HERE, k <= 32 , "get_preferred_fft_radix: Intermediate radix > 32: out of range!"); - ASSERT(HERE, isPow2(k), "get_preferred_fft_radix: Intermediate FFT radix not a power of 2!"); + ASSERT(k <= 32 , "get_preferred_fft_radix: Intermediate radix > 32: out of range!"); + ASSERT(isPow2(k), "get_preferred_fft_radix: Intermediate FFT radix not a power of 2!"); } /* If (i == kblocks), store the data directly into the NRADICES and RADIX_VEC[] globals: */ if(i == kblocks) { if(k == 0) { - ASSERT(HERE, !NRADICES, "Zero terminator of radix set found but NRADICES != 0 ... 
please check your mlucas.cfg file for duplicate FFT-length entries and remove the unwanted ones, or delete the file and rerun the self-test."); + ASSERT(!NRADICES, "Zero terminator of radix set found but NRADICES != 0 ... please check your mlucas.cfg file for duplicate FFT-length entries and remove the unwanted ones, or delete the file and rerun the self-test."); NRADICES = j; break; } else { @@ -184,20 +184,20 @@ uint32 get_preferred_fft_radix(uint32 kblocks) */ kprod *= 2; if((kprod & 1023) != 0) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: illegal data in %s file: product of complex radices (%d) not a multiple of 1K! Offending input line %s", CONFIGFILE, kprod, in_line); - ASSERT(HERE, 0, cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: illegal data in %s file: product of complex radices (%d) not a multiple of 1K! Offending input line %s", CONFIGFILE, kprod, in_line); + ASSERT(0, cbuf); } kprod >>= 10; tbest = tcurr; if(i == kblocks) { /* Product of radices must equal complex vector length (n/2): */ if(kprod != kblocks) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"get_preferred_fft_radix: mismatching data in %s file: (product of complex radices)/2^10 (%d) != kblocks/2 (%d), offending input line %s", CONFIGFILE, kprod, kblocks/2, in_line); - ASSERT(HERE, 0, cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"get_preferred_fft_radix: mismatching data in %s file: (product of complex radices)/2^10 (%d) != kblocks/2 (%d), offending input line %s", CONFIGFILE, kprod, kblocks/2, in_line); + ASSERT(0, cbuf); } retval = i; /* Preferred FFT length */ } else { - ASSERT(HERE, i == extractFFTlengthFrom32Bit(retval), "get_preferred_fft_radix: i != extractFFTlengthFrom32Bit(retval)!"); + ASSERT(i == extractFFTlengthFrom32Bit(retval), "get_preferred_fft_radix: i != extractFFTlengthFrom32Bit(retval)!"); } } } @@ -207,7 +207,7 @@ uint32 get_preferred_fft_radix(uint32 kblocks) fclose(fp); fp = 0x0; } else { sprintf(cbuf, "CONFIGFILE = %s: open failed -- please run the 
post-build self-tests as described in the README!", CONFIGFILE); - ASSERT(HERE, 0 , cbuf); + ASSERT(0 , cbuf); } /* Only return nonzero if an entry for the specified FFT length was found. @@ -229,10 +229,10 @@ uint32 extractFFTlengthFrom32Bit (uint32 n) uint32 i, nrad, retval; /* Bits <0:9> store (leading radix-1): We subtract the 1 so radices up to 1024 can be stored: */ retval = (n & 0x3ff) + 1; n >>= 10; - ASSERT(HERE, retval > 4, "extractFFTlengthFrom32Bit: Leading radix must be 5 or larger!"); + ASSERT(retval > 4, "extractFFTlengthFrom32Bit: Leading radix must be 5 or larger!"); /* Bits <10:13> store (number of FFT radices): */ nrad = (n & 0xf) ; n >>= 4; - ASSERT(HERE, nrad >= 3, "extractFFTlengthFrom32Bit: Number of radices must be 3 or larger!"); + ASSERT(nrad >= 3, "extractFFTlengthFrom32Bit: Number of radices must be 3 or larger!"); /* Each successive pair of higher-order bits stores log2[(intermediate FFT radix)/8]: */ for(i = 1; i < nrad; i++) /* Already done leading radix, so start at 1, not 0 */ { @@ -248,12 +248,12 @@ void extractFFTradicesFrom32Bit(uint32 n) uint32 i, nrad, retval; /* Bits <0:9> store (leading radix-1): We subtract the 1 so radices up to 1024 can be stored: */ retval = (n & 0x3ff) + 1; n >>= 10; - ASSERT(HERE, retval > 4, "extractFFTradicesFrom32Bit: Leading radix must be 5 or larger!"); + ASSERT(retval > 4, "extractFFTradicesFrom32Bit: Leading radix must be 5 or larger!"); RADIX_VEC[0] = retval; /* Bits <10:13> store (number of FFT radices): */ nrad = (n & 0xf) ; n >>= 4; - ASSERT(HERE, nrad >= 3, "extractFFTradicesFrom32Bit: Number of radices must be 3 or larger!"); - ASSERT(HERE, nrad <= 10, "extractFFTradicesFrom32Bit: Number of radices must be 10 or smaller!"); + ASSERT(nrad >= 3, "extractFFTradicesFrom32Bit: Number of radices must be 3 or larger!"); + ASSERT(nrad <= 10, "extractFFTradicesFrom32Bit: Number of radices must be 10 or smaller!"); NRADICES = nrad; /* Each successive pair of higher-order bits stores 
log2[(intermediate FFT radix)/8]: */ for(i = 1; i < 10; i++) /* Already done leading radix, so start at 1, not 0 */ diff --git a/src/imul_macro.c b/src/imul_macro.c index 702a99e2..7e9bc2ea 100755 --- a/src/imul_macro.c +++ b/src/imul_macro.c @@ -283,8 +283,8 @@ int test_mul() #else MUL_LOHI64(in64[i],in64[j], lo1, hi1); #endif - ASSERT(HERE, lo1 == lo0, "test_mul() low-output mismatch!"); - ASSERT(HERE, hi1 == hi0, "test_mul() hi -output mismatch!"); + ASSERT(lo1 == lo0, "test_mul() low-output mismatch!"); + ASSERT(hi1 == hi0, "test_mul() hi -output mismatch!"); /* Squaring is a special case: */ if(i ==j) @@ -294,8 +294,8 @@ int test_mul() #else SQR_LOHI64(in64[i], lo1, hi1); #endif - ASSERT(HERE, lo1 == lo0, "test_mul() low-output mismatch!"); - ASSERT(HERE, hi1 == hi0, "test_mul() hi -output mismatch!"); + ASSERT(lo1 == lo0, "test_mul() low-output mismatch!"); + ASSERT(hi1 == hi0, "test_mul() hi -output mismatch!"); } } } diff --git a/src/imul_macro0.h b/src/imul_macro0.h index b0538c1e..cc7218c2 100755 --- a/src/imul_macro0.h +++ b/src/imul_macro0.h @@ -247,10 +247,10 @@ or the with functions using them (if we declare no _-prepended variables local t hi2 = (uint64)bl2 | ((uint64)bh2 << 32);\ hi3 = (uint64)bl3 | ((uint64)bh3 << 32);\ \ - /*lo0 = (uint64)al0 + ((uint64)ah0 << 32); SQR_LOHI64(x0,&a,&b); if(a != lo0) printf("x,a,lo = %20llu %20llu %20llu\n",x0,a,lo0); if(b != hi0) printf("x,b,hi = %20llu %20llu %20llu\n",x0,b,hi0);*/\ - /*lo1 = (uint64)al1 + ((uint64)ah1 << 32); SQR_LOHI64(x1,&a,&b); if(a != lo1) printf("x,a,lo = %20llu %20llu %20llu\n",x1,a,lo1); if(b != hi1) printf("x,b,hi = %20llu %20llu %20llu\n",x1,b,hi1);*/\ - /*lo2 = (uint64)al2 + ((uint64)ah2 << 32); SQR_LOHI64(x2,&a,&b); if(a != lo2) printf("x,a,lo = %20llu %20llu %20llu\n",x2,a,lo2); if(b != hi2) printf("x,b,hi = %20llu %20llu %20llu\n",x2,b,hi2);*/\ - /*lo3 = (uint64)al3 + ((uint64)ah3 << 32); SQR_LOHI64(x3,&a,&b); if(a != lo3) printf("x,a,lo = %20llu %20llu %20llu\n",x3,a,lo3); if(b != 
hi3) printf("x,b,hi = %20llu %20llu %20llu\n",x3,b,hi3);*/\ + /*lo0 = (uint64)al0 + ((uint64)ah0 << 32); SQR_LOHI64(x0,&a,&b); if(a != lo0) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0); if(b != hi0) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,b,hi0);*/\ + /*lo1 = (uint64)al1 + ((uint64)ah1 << 32); SQR_LOHI64(x1,&a,&b); if(a != lo1) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1); if(b != hi1) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,b,hi1);*/\ + /*lo2 = (uint64)al2 + ((uint64)ah2 << 32); SQR_LOHI64(x2,&a,&b); if(a != lo2) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2); if(b != hi2) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,b,hi2);*/\ + /*lo3 = (uint64)al3 + ((uint64)ah3 << 32); SQR_LOHI64(x3,&a,&b); if(a != lo3) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3); if(b != hi3) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,b,hi3);*/\ \ /* loj = MULL64(loj,qinvj) : */\ \ @@ -281,10 +281,10 @@ or the with functions using them (if we declare no _-prepended variables local t y2 = (uint64)bl2 | ((uint64)bh2 << 32);\ y3 = (uint64)bl3 | ((uint64)bh3 << 32);\ \ - /*a = MULH64(lo0,q0); if(a != y0) printf("lo,q,hi = %20llu %20llu %20llu\n",x0,a,lo0);*/\ - /*a = MULH64(lo1,q1); if(a != y1) printf("lo,q,hi = %20llu %20llu %20llu\n",x1,a,lo1);*/\ - /*a = MULH64(lo2,q2); if(a != y2) printf("lo,q,hi = %20llu %20llu %20llu\n",x2,a,lo2);*/\ - /*a = MULH64(lo3,q3); if(a != y3) printf("lo,q,hi = %20llu %20llu %20llu\n",x3,a,lo3);*/\ + /*a = MULH64(lo0,q0); if(a != y0) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0);*/\ + /*a = MULH64(lo1,q1); if(a != y1) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1);*/\ + /*a = MULH64(lo2,q2); if(a != y2) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2);*/\ + /*a = MULH64(lo3,q3); if(a != y3) 
printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3);*/\ } /********************************************************************************/ @@ -1225,14 +1225,14 @@ or the with functions using them (if we declare no _-prepended variables local t qil2 = (uint32) qinv2;\ qil3 = (uint32) qinv3;\ \ - DBG_ASSERT(HERE, (ql0 >> 32) == 0,"MOD_INI_Q4: (ql0 >> 32) == 0");\ - DBG_ASSERT(HERE, (ql1 >> 32) == 0,"MOD_INI_Q4: (ql1 >> 32) == 0");\ - DBG_ASSERT(HERE, (ql2 >> 32) == 0,"MOD_INI_Q4: (ql2 >> 32) == 0");\ - DBG_ASSERT(HERE, (ql3 >> 32) == 0,"MOD_INI_Q4: (ql3 >> 32) == 0");\ - DBG_ASSERT(HERE, (qil0 >> 32) == 0,"MOD_INI_Q4: (qil0 >> 32) == 0");\ - DBG_ASSERT(HERE, (qil1 >> 32) == 0,"MOD_INI_Q4: (qil1 >> 32) == 0");\ - DBG_ASSERT(HERE, (qil2 >> 32) == 0,"MOD_INI_Q4: (qil2 >> 32) == 0");\ - DBG_ASSERT(HERE, (qil3 >> 32) == 0,"MOD_INI_Q4: (qil3 >> 32) == 0");\ + DBG_ASSERT((ql0 >> 32) == 0,"MOD_INI_Q4: (ql0 >> 32) == 0");\ + DBG_ASSERT((ql1 >> 32) == 0,"MOD_INI_Q4: (ql1 >> 32) == 0");\ + DBG_ASSERT((ql2 >> 32) == 0,"MOD_INI_Q4: (ql2 >> 32) == 0");\ + DBG_ASSERT((ql3 >> 32) == 0,"MOD_INI_Q4: (ql3 >> 32) == 0");\ + DBG_ASSERT((qil0 >> 32) == 0,"MOD_INI_Q4: (qil0 >> 32) == 0");\ + DBG_ASSERT((qil1 >> 32) == 0,"MOD_INI_Q4: (qil1 >> 32) == 0");\ + DBG_ASSERT((qil2 >> 32) == 0,"MOD_INI_Q4: (qil2 >> 32) == 0");\ + DBG_ASSERT((qil3 >> 32) == 0,"MOD_INI_Q4: (qil3 >> 32) == 0");\ \ qh0 = (uint32)(q0 >> 32);\ qh1 = (uint32)(q1 >> 32);\ @@ -1243,14 +1243,14 @@ or the with functions using them (if we declare no _-prepended variables local t qih2 = (uint32)(qinv2 >> 32);\ qih3 = (uint32)(qinv3 >> 32);\ \ - DBG_ASSERT(HERE, (qh0 >> 32) == 0,"MOD_INI_Q4: (qh0 >> 32) == 0");\ - DBG_ASSERT(HERE, (qh1 >> 32) == 0,"MOD_INI_Q4: (qh1 >> 32) == 0");\ - DBG_ASSERT(HERE, (qh2 >> 32) == 0,"MOD_INI_Q4: (qh2 >> 32) == 0");\ - DBG_ASSERT(HERE, (qh3 >> 32) == 0,"MOD_INI_Q4: (qh3 >> 32) == 0");\ - DBG_ASSERT(HERE, (qih0 >> 32) == 0,"MOD_INI_Q4: (qih0 >> 32) == 0");\ - 
DBG_ASSERT(HERE, (qih1 >> 32) == 0,"MOD_INI_Q4: (qih1 >> 32) == 0");\ - DBG_ASSERT(HERE, (qih2 >> 32) == 0,"MOD_INI_Q4: (qih2 >> 32) == 0");\ - DBG_ASSERT(HERE, (qih3 >> 32) == 0,"MOD_INI_Q4: (qih3 >> 32) == 0");\ + DBG_ASSERT((qh0 >> 32) == 0,"MOD_INI_Q4: (qh0 >> 32) == 0");\ + DBG_ASSERT((qh1 >> 32) == 0,"MOD_INI_Q4: (qh1 >> 32) == 0");\ + DBG_ASSERT((qh2 >> 32) == 0,"MOD_INI_Q4: (qh2 >> 32) == 0");\ + DBG_ASSERT((qh3 >> 32) == 0,"MOD_INI_Q4: (qh3 >> 32) == 0");\ + DBG_ASSERT((qih0 >> 32) == 0,"MOD_INI_Q4: (qih0 >> 32) == 0");\ + DBG_ASSERT((qih1 >> 32) == 0,"MOD_INI_Q4: (qih1 >> 32) == 0");\ + DBG_ASSERT((qih2 >> 32) == 0,"MOD_INI_Q4: (qih2 >> 32) == 0");\ + DBG_ASSERT((qih3 >> 32) == 0,"MOD_INI_Q4: (qih3 >> 32) == 0");\ } /* For each input xj, calculates the following sequence: @@ -1292,10 +1292,10 @@ or the with functions using them (if we declare no _-prepended variables local t hi2 = (uint64)bl2 | ((uint64)bh2 << 32);\ hi3 = (uint64)bl3 | ((uint64)bh3 << 32);\ /* DEBUG:\ - lo0 = (uint64)al0 + ((uint64)ah0 << 32); SQR_LOHI64(x0,&a,&b); if(a != lo0) printf("x,a,lo = %20llu %20llu %20llu\n",x0,a,lo0); if(b != hi0) printf("x,b,hi = %20llu %20llu %20llu\n",x0,b,hi0); \ - lo1 = (uint64)al1 + ((uint64)ah1 << 32); SQR_LOHI64(x1,&a,&b); if(a != lo1) printf("x,a,lo = %20llu %20llu %20llu\n",x1,a,lo1); if(b != hi1) printf("x,b,hi = %20llu %20llu %20llu\n",x1,b,hi1); \ - lo2 = (uint64)al2 + ((uint64)ah2 << 32); SQR_LOHI64(x2,&a,&b); if(a != lo2) printf("x,a,lo = %20llu %20llu %20llu\n",x2,a,lo2); if(b != hi2) printf("x,b,hi = %20llu %20llu %20llu\n",x2,b,hi2); \ - lo3 = (uint64)al3 + ((uint64)ah3 << 32); SQR_LOHI64(x3,&a,&b); if(a != lo3) printf("x,a,lo = %20llu %20llu %20llu\n",x3,a,lo3); if(b != hi3) printf("x,b,hi = %20llu %20llu %20llu\n",x3,b,hi3); \ + lo0 = (uint64)al0 + ((uint64)ah0 << 32); SQR_LOHI64(x0,&a,&b); if(a != lo0) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0); if(b != hi0) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" 
PRIu64 "\n",x0,b,hi0); \ + lo1 = (uint64)al1 + ((uint64)ah1 << 32); SQR_LOHI64(x1,&a,&b); if(a != lo1) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1); if(b != hi1) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,b,hi1); \ + lo2 = (uint64)al2 + ((uint64)ah2 << 32); SQR_LOHI64(x2,&a,&b); if(a != lo2) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2); if(b != hi2) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,b,hi2); \ + lo3 = (uint64)al3 + ((uint64)ah3 << 32); SQR_LOHI64(x3,&a,&b); if(a != lo3) printf("x,a,lo = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3); if(b != hi3) printf("x,b,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,b,hi3); \ */\ /* loj = MULL64(loj,qinvj) : */\ /* DEBUG:\ @@ -1326,10 +1326,10 @@ or the with functions using them (if we declare no _-prepended variables local t y2 = (uint64)bl2 | ((uint64)bh2 << 32);\ y3 = (uint64)bl3 | ((uint64)bh3 << 32);\ /* DEBUG:\ - a = MULH64(lo0,q0); if(a != y0) printf("lo,q,hi = %20llu %20llu %20llu\n",x0,a,lo0); \ - a = MULH64(lo1,q1); if(a != y1) printf("lo,q,hi = %20llu %20llu %20llu\n",x1,a,lo1); \ - a = MULH64(lo2,q2); if(a != y2) printf("lo,q,hi = %20llu %20llu %20llu\n",x2,a,lo2); \ - a = MULH64(lo3,q3); if(a != y3) printf("lo,q,hi = %20llu %20llu %20llu\n",x3,a,lo3); \ + a = MULH64(lo0,q0); if(a != y0) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x0,a,lo0); \ + a = MULH64(lo1,q1); if(a != y1) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x1,a,lo1); \ + a = MULH64(lo2,q2); if(a != y2) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x2,a,lo2); \ + a = MULH64(lo3,q3); if(a != y3) printf("lo,q,hi = %20" PRIu64 " %20" PRIu64 " %20" PRIu64 "\n",x3,a,lo3); \ */\ } @@ -1375,7 +1375,7 @@ or the with functions using them (if we declare no _-prepended variables local t char s0[21],s1[21];\ uint64 _t,_a,_b;\ \ - ASSERT(HERE, ((uint64)(_y) >> 32) == 0,"MUL64x32: ((_y) >> 32) 
== 0");\ + ASSERT(((uint64)(_y) >> 32) == 0,"MUL64x32: ((_y) >> 32) == 0");\ MUL_LOHI64((_x), (uint64)(_y), _a, _b);\ \ _lo = ((uint32)((_x) & 0x00000000ffffffff)) * (_y); /* a*c */\ @@ -1390,7 +1390,7 @@ or the with functions using them (if we declare no _-prepended variables local t printf("x = %s, y = %s\n", s0[convert_uint64_base10_char(s0,_x )], s1[convert_uint64_base10_char(s1,_y)]);\ printf("LO= %s, A = %s\n", s0[convert_uint64_base10_char(s0,_lo)], s1[convert_uint64_base10_char(s1,_a)]);\ printf("HI= %s, B = %s\n", s0[convert_uint64_base10_char(s0,_hi)], s1[convert_uint64_base10_char(s0,_b)]);\ - ASSERT(HERE, 0,"0");\ + ASSERT(0,"0");\ }\ } #else diff --git a/src/imul_macro1.h b/src/imul_macro1.h index f1dc277f..d42bdd41 100755 --- a/src/imul_macro1.h +++ b/src/imul_macro1.h @@ -87,8 +87,8 @@ that is advantageous on at least an appreciable set of CPUs. #define ADD160(__x, __y, __sum)\ {\ - DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"ADD160: (__x.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"ADD160: (__y.d2 >> 32) == 0");\ + DBG_ASSERT((__x.d2 >> 32) == 0,"ADD160: (__x.d2 >> 32) == 0");\ + DBG_ASSERT((__y.d2 >> 32) == 0,"ADD160: (__y.d2 >> 32) == 0");\ ADD192(__x, __y, __sum);\ __sum.d2 &= 0x00000000ffffffff; /* In case of add need to take care to get proper mod-2^160 result */\ } @@ -192,8 +192,8 @@ that is advantageous on at least an appreciable set of CPUs. #define SUB160(__x, __y, __dif)\ {\ - DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"SUB160: (__x.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"SUB160: (__y.d2 >> 32) == 0");\ + DBG_ASSERT((__x.d2 >> 32) == 0,"SUB160: (__x.d2 >> 32) == 0");\ + DBG_ASSERT((__y.d2 >> 32) == 0,"SUB160: (__y.d2 >> 32) == 0");\ SUB192(__x, __y, __dif);\ __dif.d2 &= 0x00000000ffffffff; /* In case of add need to take care to get proper mod-2^160 result */\ } @@ -301,7 +301,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. 
/* Left-shifts: */ #define LSHIFT128(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT128: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"LSHIFT128: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -326,7 +326,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. } #define LSHIFT96(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT96: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"LSHIFT96: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -357,7 +357,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. #define LSHIFT192(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT192: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"LSHIFT192: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -398,14 +398,14 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. } #define LSHIFT160(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"LSHIFT160: ((uint64)__x.d2 >> 32) == 0");\ + DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"LSHIFT160: ((uint64)__x.d2 >> 32) == 0");\ LSHIFT192(__x,__n, __y);\ __y.d2 &= 0x00000000ffffffff;\ } #define LSHIFT256(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT256: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"LSHIFT256: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -469,7 +469,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. /* (Logical) Right-shifts: */ #define RSHIFT128(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT128: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"RSHIFT128: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -494,7 +494,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. 
} #define RSHIFT96(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT96: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"RSHIFT96: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -530,7 +530,7 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. #define RSHIFT192(__x, __n, __y)\ {\ int __lsh,__rsh;\ - DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT192: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"RSHIFT192: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -576,14 +576,14 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. } #define RSHIFT160(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"RSHIFT160: ((uint64)__x.d2 >> 32) == 0");\ + DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"RSHIFT160: ((uint64)__x.d2 >> 32) == 0");\ RSHIFT192(__x,__n, __y);\ - DBG_ASSERT(HERE, (uint64)__y.d2 <= (uint64)__x.d2,"RSHIFT160: (uint64)__y.d2 <= (uint64)__x.d2");\ + DBG_ASSERT((uint64)__y.d2 <= (uint64)__x.d2,"RSHIFT160: (uint64)__y.d2 <= (uint64)__x.d2");\ } #define RSHIFT256(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"RSHIFT256: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"RSHIFT256: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -647,38 +647,38 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. 
/* Left-shifts: */ #define LSHIFT_FAST128(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (uint64)__n != 0,"LSHIFT_FAST128: (uint64)__n != 0");\ - DBG_ASSERT(HERE, (uint64)__n < 64,"LSHIFT_FAST128: (uint64)__n < 64");\ + DBG_ASSERT((uint64)__n != 0,"LSHIFT_FAST128: (uint64)__n != 0");\ + DBG_ASSERT((uint64)__n < 64,"LSHIFT_FAST128: (uint64)__n < 64");\ __y.d1 = ((uint64)__x.d1 << __n) + ((uint64)__x.d0 >> (64-__n));\ __y.d0 = ((uint64)__x.d0 << __n);\ } #define LSHIFT_FAST96(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n > 0,"LSHIFT96: (int64)__n > 0");\ - DBG_ASSERT(HERE, (int64)__n < 32,"LSHIFT96: (int64)__n < 32");\ + DBG_ASSERT((int64)__n > 0,"LSHIFT96: (int64)__n > 0");\ + DBG_ASSERT((int64)__n < 32,"LSHIFT96: (int64)__n < 32");\ __y.d1 = ((uint32)__x.d1 << __n) + (uint32)((uint64)__x.d0 >> (64-__n));\ __y.d0 = ((uint64)__x.d0 << __n);\ } #define LSHIFT_FAST192(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (uint64)__n != 0,"LSHIFT_FAST192: (uint64)__n != 0");\ - DBG_ASSERT(HERE, (uint64)__n < 64,"LSHIFT_FAST192: (uint64)__n < 64");\ + DBG_ASSERT((uint64)__n != 0,"LSHIFT_FAST192: (uint64)__n != 0");\ + DBG_ASSERT((uint64)__n < 64,"LSHIFT_FAST192: (uint64)__n < 64");\ __y.d2 = ((uint64)__x.d2 << __n) + ((uint64)__x.d1 >> (64-__n));\ __y.d1 = ((uint64)__x.d1 << __n) + ((uint64)__x.d0 >> (64-__n));\ __y.d0 = ((uint64)__x.d0 << __n);\ } #define LSHIFT_FAST160(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"LSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\ + DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"LSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\ LSHIFT_FAST192(__x,__n, __y);\ __y.d2 &= 0x00000000ffffffff;\ } #define LSHIFT_FAST256(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (uint64)__n != 0,"LSHIFT_FAST256: (uint64)__n != 0");\ - DBG_ASSERT(HERE, (uint64)__n < 64,"LSHIFT_FAST256: (uint64)__n < 64");\ + DBG_ASSERT((uint64)__n != 0,"LSHIFT_FAST256: (uint64)__n != 0");\ + DBG_ASSERT((uint64)__n < 64,"LSHIFT_FAST256: (uint64)__n < 64");\ __y.d3 = ((uint64)__x.d3 << __n) + 
((uint64)__x.d2 >> (64-__n));\ __y.d2 = ((uint64)__x.d2 << __n) + ((uint64)__x.d1 >> (64-__n));\ __y.d1 = ((uint64)__x.d1 << __n) + ((uint64)__x.d0 >> (64-__n));\ @@ -688,38 +688,38 @@ If the shift count (__n) is >= the width of the integer type, 0 is returned. /* (Logical) Right-shifts: */ #define RSHIFT_FAST128(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (uint64)__n != 0,"RSHIFT_FAST128: (uint64)__n != 0");\ - DBG_ASSERT(HERE, (uint64)__n < 64,"RSHIFT_FAST128: (uint64)__n < 64");\ + DBG_ASSERT((uint64)__n != 0,"RSHIFT_FAST128: (uint64)__n != 0");\ + DBG_ASSERT((uint64)__n < 64,"RSHIFT_FAST128: (uint64)__n < 64");\ __y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\ __y.d1 = ((uint64)__x.d1 >> __n);\ } #define RSHIFT_FAST96(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n > 0,"RSHIFT96: (int64)__n > 0");\ - DBG_ASSERT(HERE, (int64)__n < 32,"RSHIFT96: (int64)__n < 32");\ + DBG_ASSERT((int64)__n > 0,"RSHIFT96: (int64)__n > 0");\ + DBG_ASSERT((int64)__n < 32,"RSHIFT96: (int64)__n < 32");\ __y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\ __y.d1 = ((uint32)__x.d1 >> __n);\ } #define RSHIFT_FAST192(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (uint64)__n != 0,"RSHIFT_FAST192: (uint64)__n != 0");\ - DBG_ASSERT(HERE, (uint64)__n < 64,"RSHIFT_FAST192: (uint64)__n < 64");\ + DBG_ASSERT((uint64)__n != 0,"RSHIFT_FAST192: (uint64)__n != 0");\ + DBG_ASSERT((uint64)__n < 64,"RSHIFT_FAST192: (uint64)__n < 64");\ __y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\ __y.d1 = ((uint64)__x.d1 >> __n) + ((uint64)__x.d2 << (64-__n));\ __y.d2 = ((uint64)__x.d2 >> __n);\ } #define RSHIFT_FAST160(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, ((uint64)__x.d2 >> 32) == 0,"RSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\ + DBG_ASSERT(((uint64)__x.d2 >> 32) == 0,"RSHIFT_FAST160: ((uint64)__x.d2 >> 32) == 0");\ RSHIFT_FAST192(__x,__n, __y);\ __y.d2 &= 0x00000000ffffffff;\ } #define RSHIFT_FAST256(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (uint64)__n != 0,"RSHIFT_FAST256: 
(uint64)__n != 0");\ - DBG_ASSERT(HERE, (uint64)__n < 64,"RSHIFT_FAST256: (uint64)__n < 64");\ + DBG_ASSERT((uint64)__n != 0,"RSHIFT_FAST256: (uint64)__n != 0");\ + DBG_ASSERT((uint64)__n < 64,"RSHIFT_FAST256: (uint64)__n < 64");\ __y.d0 = ((uint64)__x.d0 >> __n) + ((uint64)__x.d1 << (64-__n));\ __y.d1 = ((uint64)__x.d1 >> __n) + ((uint64)__x.d2 << (64-__n));\ __y.d2 = ((uint64)__x.d2 >> __n) + ((uint64)__x.d3 << (64-__n));\ @@ -751,7 +751,7 @@ Cast the result of the high-part-equals-zero test to a signed 32-bit (-1) becaus #define LSHIFT96_PTR(__x, __n, __y)\ {\ - DBG_ASSERT(HERE, (int64)__n >= 0,"LSHIFT96_PTR: (int64)__n >= 0");\ + DBG_ASSERT((int64)__n >= 0,"LSHIFT96_PTR: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -896,7 +896,7 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. {\ uint64 __l,__m,__h,__a,__b,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\ __t = (uint64)(__x.d1);\ __h = __t*__t;\ SQR_LOHI64(__x.d0, &__l,&__m);\ @@ -914,7 +914,7 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. {\ uint64 __l,__m,__h,__a,__b,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 31) == 0,"SQR_LOHI95: (__x.d1 >> 31) == 0");\ + DBG_ASSERT((__x.d1 >> 31) == 0,"SQR_LOHI95: (__x.d1 >> 31) == 0");\ __t = (uint64)(__x.d1);\ __h = __t*__t;\ SQR_LOHI64(__x.d0, &__l,&__m);\ @@ -998,7 +998,7 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. {\ uint64 __l,__m,__h,__a,__b;\ uint32 __tt = __x.d1, __hl32,__hh32;\ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI96: (__x.d1 >> 32) == 0");\ MUL64x32(__x.d0,__tt, __a, __b);\ SQR_LOHI64(__x.d0, __l, __m);\ MUL_LOHI32(__tt,__tt,__hl32,__hh32);\ @@ -1025,10 +1025,10 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. 
uint64 __a0,__a1,__a2,__a3,\ __b0,__b1,__b2,__b3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\ \ SQR_LOHI (__x0.d0, __a0 , __b0 );\ SQR_LOHI (__x1.d0, __a1 , __b1 );\ @@ -1091,10 +1091,10 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. __b0,__b1,__b2,__b3,\ __s0,__s1,__s2,__s3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q4: (__x3.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -1229,10 +1229,10 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. 
__b0,__b1,__b2,__b3,\ __s0,__s1,__s2,__s3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x0.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x1.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x2.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x3.d1 >> 31) == 0");\ + DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x0.d1 >> 31) == 0");\ + DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x1.d1 >> 31) == 0");\ + DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x2.d1 >> 31) == 0");\ + DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI95_q4: (__x3.d1 >> 31) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -1296,14 +1296,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. uint64 __a0,__a1,__a2,__a3,__a4,__a5,__a6,__a7,\ __b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: 
(__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI (__x0.d0, __a0 , __b0 );\ SQR_LOHI (__x1.d0, __a1 , __b1 );\ @@ -1406,14 +1406,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. __b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\ __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -1529,14 +1529,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. 
__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\ __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -1646,14 +1646,14 @@ On Alpha, this needs a total of 5 MUL instructions and 9 ALU ops. 
__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\ __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x0.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x1.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x2.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x3.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x4.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x5.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x6.d1 >> 31) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x7.d1 >> 31) == 0");\ + DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x0.d1 >> 31) == 0");\ + DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x1.d1 >> 31) == 0");\ + DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x2.d1 >> 31) == 0");\ + DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x3.d1 >> 31) == 0");\ + DBG_ASSERT((__x4.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x4.d1 >> 31) == 0");\ + DBG_ASSERT((__x5.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x5.d1 >> 31) == 0");\ + DBG_ASSERT((__x6.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x6.d1 >> 31) == 0");\ + DBG_ASSERT((__x7.d1 >> 31) == 0,"SQR_LOHI95_q8: (__x7.d1 >> 31) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -1743,8 +1743,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. {\ uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ __s = (uint64)(__x.d1);\ __t = (uint64)(__y.d1);\ __h = __s*__t;\ @@ -1766,8 +1766,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. 
{\ uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ __s = (uint64)(__x.d1);\ __t = (uint64)(__y.d1);\ __h = __s*__t;\ @@ -1795,8 +1795,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. {\ uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ __s = (uint64)(__x.d1);\ __t = (uint64)(__y.d1);\ __h = __s*__t;\ @@ -1817,8 +1817,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. {\ uint64 __l,__m,__h,__a,__b,__c,__d,__s,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MUL_LOHI96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MUL_LOHI96: (__y.d1 >> 32) == 0");\ __s = (uint64)(__x.d1);\ __t = (uint64)(__y.d1);\ __h = __s*__t;\ @@ -1847,8 +1847,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. 
{\ uint64 __l,__m;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\ MUL_LOHI64(__x.d0,__y.d0,&__l,&__m);\ __m += __MULL32(__x.d1,__y.d0) + __MULL32(__y.d1,__x.d0); /* Only need the bottom 32 bits of each product here */\ __lo.d0 = __l; __lo.d1 = __m & 0x00000000ffffffff;\ @@ -1894,8 +1894,8 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. {\ uint64 __l,__m;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MULL96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULL96: (__y.d1 >> 32) == 0");\ MUL_LOHI64(__x.d0,__y.d0, __l, __m);\ __m += __MULL32(__x.d1,__y.d0) + __MULL32(__y.d1,__x.d0); /* Only need the bottom 32 bits of each product here */\ __lo.d0 = __l; __lo.d1 = __m & 0x00000000ffffffff;\ @@ -1913,15 +1913,15 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. 
uint64 __a0,__a1,__a2,__a3,\ __b0,__b1,__b2,__b3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULL96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULL96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULL96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULL96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"MULL96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"MULL96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"MULL96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"MULL96_q4: (__x3.d1 >> 32) == 0");\ \ - DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULL96_q4: (__y0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULL96_q4: (__y1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULL96_q4: (__y2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULL96_q4: (__y3.d1 >> 32) == 0");\ + DBG_ASSERT((__y0.d1 >> 32) == 0,"MULL96_q4: (__y0.d1 >> 32) == 0");\ + DBG_ASSERT((__y1.d1 >> 32) == 0,"MULL96_q4: (__y1.d1 >> 32) == 0");\ + DBG_ASSERT((__y2.d1 >> 32) == 0,"MULL96_q4: (__y2.d1 >> 32) == 0");\ + DBG_ASSERT((__y3.d1 >> 32) == 0,"MULL96_q4: (__y3.d1 >> 32) == 0");\ \ MUL_LOHI64(__x0.d0,__y0.d0, __a0, __b0);\ MUL_LOHI64(__x1.d0,__y1.d0, __a1, __b1);\ @@ -1955,23 +1955,23 @@ On Alpha, this needs a total of 7 MUL, 12 ALU op. 
uint64 __a0,__a1,__a2,__a3,__a4,__a5,__a6,__a7,\ __b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULL96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULL96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULL96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULL96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"MULL96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"MULL96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"MULL96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"MULL96_q8: (__x7.d1 >> 32) == 0");\ - \ - DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULL96_q8: (__y0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULL96_q8: (__y1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULL96_q8: (__y2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULL96_q8: (__y3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y4.d1 >> 32) == 0,"MULL96_q8: (__y4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y5.d1 >> 32) == 0,"MULL96_q8: (__y5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y6.d1 >> 32) == 0,"MULL96_q8: (__y6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y7.d1 >> 32) == 0,"MULL96_q8: (__y7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"MULL96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"MULL96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"MULL96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"MULL96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"MULL96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"MULL96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"MULL96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"MULL96_q8: (__x7.d1 >> 32) == 0");\ + \ + DBG_ASSERT((__y0.d1 >> 32) == 0,"MULL96_q8: (__y0.d1 >> 32) == 0");\ + DBG_ASSERT((__y1.d1 >> 
32) == 0,"MULL96_q8: (__y1.d1 >> 32) == 0");\ + DBG_ASSERT((__y2.d1 >> 32) == 0,"MULL96_q8: (__y2.d1 >> 32) == 0");\ + DBG_ASSERT((__y3.d1 >> 32) == 0,"MULL96_q8: (__y3.d1 >> 32) == 0");\ + DBG_ASSERT((__y4.d1 >> 32) == 0,"MULL96_q8: (__y4.d1 >> 32) == 0");\ + DBG_ASSERT((__y5.d1 >> 32) == 0,"MULL96_q8: (__y5.d1 >> 32) == 0");\ + DBG_ASSERT((__y6.d1 >> 32) == 0,"MULL96_q8: (__y6.d1 >> 32) == 0");\ + DBG_ASSERT((__y7.d1 >> 32) == 0,"MULL96_q8: (__y7.d1 >> 32) == 0");\ \ MUL_LOHI64(__x0.d0,__y0.d0, __a0, __b0);\ MUL_LOHI64(__x1.d0,__y1.d0, __a1, __b1);\ @@ -2018,8 +2018,8 @@ neglect of the lower bits, but that seems well below the likely level of hardwar {\ uint64 __m,__h,__a,__b,__c,__d,__s,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\ __s = (uint64)(__x.d1);\ __t = (uint64)(__y.d1);\ __h = __s*__t;\ @@ -2095,8 +2095,8 @@ to get the 16x64==>80-bit intermediate products. {\ uint64 __a,__b,__xlo,__ylo,__xhi,__yhi,__lo;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 16) == 0,"MULH96_80 : (__x.d1 >> 16) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 16) == 0,"MULH96_80 : (__y.d1 >> 16) == 0");\ + DBG_ASSERT((__x.d1 >> 16) == 0,"MULH96_80 : (__x.d1 >> 16) == 0");\ + DBG_ASSERT((__y.d1 >> 16) == 0,"MULH96_80 : (__y.d1 >> 16) == 0");\ __xhi =(__x.d1 << 48) + (__x.d0 >> 16);\ __yhi =(__y.d1 << 48) + (__y.d0 >> 16);\ __xlo = __x.d0 << 48; /* xlo << 48 */\ @@ -2116,8 +2116,8 @@ to get the 16x64==>80-bit intermediate products. 
{\ uint64 __m,__h,__aa,__bb,__cc,__dd,__s,__t;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MULH96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULH96: (__y.d1 >> 32) == 0");\ __s = (uint64)(__x.d1);\ __t = (uint64)(__y.d1);\ __h = __s*__t;\ @@ -2150,15 +2150,15 @@ to get the 16x64==>80-bit intermediate products. uint64 __a2,__b2,__c2,__d2,__m2,__h2;\ uint64 __a3,__b3,__c3,__d3,__m3,__h3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULH96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULH96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULH96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULH96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"MULH96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"MULH96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"MULH96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"MULH96_q4: (__x3.d1 >> 32) == 0");\ \ - DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH96_q4: (__y0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH96_q4: (__y1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH96_q4: (__y2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH96_q4: (__y3.d1 >> 32) == 0");\ + DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH96_q4: (__y0.d1 >> 32) == 0");\ + DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH96_q4: (__y1.d1 >> 32) == 0");\ + DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH96_q4: (__y2.d1 >> 32) == 0");\ + DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH96_q4: (__y3.d1 >> 32) == 0");\ \ MUL_LOHI32(__x0.d1, __y0.d1, __l32_0, __h32_0);\ MUL_LOHI32(__x1.d1, __y1.d1, __l32_1, __h32_1);\ @@ -2234,23 +2234,23 @@ to get the 16x64==>80-bit intermediate products. 
uint64 __a6,__b6,__c6,__d6,__m6,__h6;\ uint64 __a7,__b7,__c7,__d7,__m7,__h7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"MULH96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"MULH96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"MULH96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"MULH96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"MULH96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"MULH96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"MULH96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"MULH96_q8: (__x7.d1 >> 32) == 0");\ - \ - DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH96_q8: (__y0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH96_q8: (__y1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH96_q8: (__y2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH96_q8: (__y3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y4.d1 >> 32) == 0,"MULH96_q8: (__y4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y5.d1 >> 32) == 0,"MULH96_q8: (__y5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y6.d1 >> 32) == 0,"MULH96_q8: (__y6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y7.d1 >> 32) == 0,"MULH96_q8: (__y7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"MULH96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"MULH96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"MULH96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"MULH96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"MULH96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"MULH96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"MULH96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"MULH96_q8: (__x7.d1 >> 32) == 0");\ + \ + DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH96_q8: (__y0.d1 >> 32) == 0");\ + DBG_ASSERT((__y1.d1 >> 32) == 
0,"MULH96_q8: (__y1.d1 >> 32) == 0");\ + DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH96_q8: (__y2.d1 >> 32) == 0");\ + DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH96_q8: (__y3.d1 >> 32) == 0");\ + DBG_ASSERT((__y4.d1 >> 32) == 0,"MULH96_q8: (__y4.d1 >> 32) == 0");\ + DBG_ASSERT((__y5.d1 >> 32) == 0,"MULH96_q8: (__y5.d1 >> 32) == 0");\ + DBG_ASSERT((__y6.d1 >> 32) == 0,"MULH96_q8: (__y6.d1 >> 32) == 0");\ + DBG_ASSERT((__y7.d1 >> 32) == 0,"MULH96_q8: (__y7.d1 >> 32) == 0");\ \ __h0 = (uint64)__x0.d1*(uint64)__y0.d1;\ __h1 = (uint64)__x1.d1*(uint64)__y1.d1;\ @@ -2973,7 +2973,7 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. {\ uint64 __w0,__w1,__w2,__a,__b;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\ SQR_LOHI64(__x.d0, &__w0,&__w1);\ /* Need to add 2*a*b, so simply double b (which has at most 32 bits) prior to the MUL_LOHI: */\ MUL_LOHI64(__x.d0,__x.d1 << 1,&__a ,&__b );\ @@ -3061,7 +3061,7 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. {\ uint64 __w0,__w1,__w2,__a,__b;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\ SQR_LOHI64(__x.d0, __w0, __w1);\ /* Need to add 2*a*b, so simply double b (which has at most 32 bits) prior to the MUL_LOHI: */\ MUL_LOHI64_ADD(__x.d0, __x.d1 << 1, __w1, __a , __b );\ @@ -3073,7 +3073,7 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. 
{\ uint64 __w0,__w1,__w2,__a,__b;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"SQR_LOHI128_96: (__x.d1 >> 32) == 0");\ SQR_LOHI64(__x.d0, __w0, __w1);\ /* Need to add 2*a*b, so simply double b (which has at most 32 bits) prior to the MUL_LOHI: */\ MUL_LOHI64(__x.d0, __x.d1 << 1, __a , __b );\ @@ -3099,10 +3099,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. uint64 __a0,__a1,__a2,__a3,\ __b0,__b1,__b2,__b3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\ \ SQR_LOHI (__x0.d0, __a0 , __b0 );\ SQR_LOHI (__x1.d0, __a1 , __b1 );\ @@ -3155,10 +3155,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. 
__b0,__b1,__b2,__b3,\ __s0,__s1,__s2,__s3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -3215,10 +3215,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. __b0,__b1,__b2,__b3,\ __s0,__s1,__s2,__s3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q4: (__x3.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -3269,10 +3269,10 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. 
__b0,__b1,__b2,__b3,\ __s0,__s1,__s2,__s3;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI128_95_q4: (__x3.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -3321,14 +3321,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. uint64 __a0,__a1,__a2,__a3,__a4,__a5,__a6,__a7,\ __b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 
0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI (__x0.d0, __a0 , __b0 );\ SQR_LOHI (__x1.d0, __a1 , __b1 );\ @@ -3413,14 +3413,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. __b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\ __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -3509,14 +3509,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 
ALU ops. __b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\ __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 32) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -3599,14 +3599,14 @@ On Alpha, this needs a total of 5 MUL instructions and 5 ALU ops. 
__b0,__b1,__b2,__b3,__b4,__b5,__b6,__b7,\ __s0,__s1,__s2,__s3,__s4,__s5,__s6,__s7;\ \ - DBG_ASSERT(HERE, (__x0.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ + DBG_ASSERT((__x0.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x0.d1 >> 32) == 0");\ + DBG_ASSERT((__x1.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x1.d1 >> 32) == 0");\ + DBG_ASSERT((__x2.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x2.d1 >> 32) == 0");\ + DBG_ASSERT((__x3.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x3.d1 >> 32) == 0");\ + DBG_ASSERT((__x4.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x4.d1 >> 32) == 0");\ + DBG_ASSERT((__x5.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x5.d1 >> 32) == 0");\ + DBG_ASSERT((__x6.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x6.d1 >> 32) == 0");\ + DBG_ASSERT((__x7.d1 >> 31) == 0,"SQR_LOHI128_96_q8: (__x7.d1 >> 32) == 0");\ \ SQR_LOHI64(__x0.d0, __a0 , __b0 );\ SQR_LOHI64(__x1.d0, __a1 , __b1 );\ @@ -3671,7 +3671,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on {\ uint64 __w1,__w2,__w3,__a,__b,__c,__d,__cy;\ \ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\ \ MULH64( __x.d0,__y.d0, __w1);\ MUL64x32( __x.d0,__y.d1,&__a ,&__b );\ @@ -3680,7 +3680,7 @@ MUL64x32s, i.e. 
are significantly cheaper than full-blown MUL_LOHIs on /* First add [a,b] + [c,d] : since b and d <= 2^64 - 2, can add carryout of a+c sans ripple-carry check: */\ __a += __c;\ __b += __d + (__a < __c);\ - DBG_ASSERT(HERE, (__b >= __d),"MULH128x96: unexpected carryout of __b");\ + DBG_ASSERT((__b >= __d),"MULH128x96: unexpected carryout of __b");\ /* Now add [w1,w2,w3] + [a,b,0]: */\ __w1 += __a;\ __cy = (__w1 < __a);\ @@ -3731,7 +3731,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on {\ uint64 __w1,__w2,__w3,__a,__b,__c,__d,__cy;\ \ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128x96: (__y.d1 >> 32) == 0");\ \ MULH64( __x.d0,__y.d0, __w1);\ MUL64x32( __x.d0,__y.d1, __a , __b );\ @@ -3740,7 +3740,7 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on /* First add [a,b] + [c,d] : since b and d <= 2^64 - 2, can add carryout of a+c sans ripple-carry check: */\ __a += __c;\ __b += __d + (__a < __c);\ - DBG_ASSERT(HERE, (__b >= __d),"MULH128x96: unexpected carryout of __b");\ + DBG_ASSERT((__b >= __d),"MULH128x96: unexpected carryout of __b");\ /* Now add [w1,w2,w3] + [a,b,0]: */\ __w1 += __a;\ __cy = (__w1 < __a);\ @@ -3763,10 +3763,10 @@ MUL64x32s, i.e. 
are significantly cheaper than full-blown MUL_LOHIs on uint64 __t2,__a2,__b2,__c2,__d2,__cy2;\ uint64 __t3,__a3,__b3,__c3,__d3,__cy3;\ \ - DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH128x96_q4: (__y0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH128x96_q4: (__y1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH128x96_q4: (__y2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH128x96_q4: (__y3.d1 >> 32) == 0");\ + DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH128x96_q4: (__y0.d1 >> 32) == 0");\ + DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH128x96_q4: (__y1.d1 >> 32) == 0");\ + DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH128x96_q4: (__y2.d1 >> 32) == 0");\ + DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH128x96_q4: (__y3.d1 >> 32) == 0");\ \ MULH64( __x0.d0,__y0.d0, __t0);\ MULH64( __x1.d0,__y1.d0, __t1);\ @@ -3851,14 +3851,14 @@ MUL64x32s, i.e. are significantly cheaper than full-blown MUL_LOHIs on uint64 __t6,__a6,__b6,__c6,__d6,__cy6;\ uint64 __t7,__a7,__b7,__c7,__d7,__cy7;\ \ - DBG_ASSERT(HERE, (__y0.d1 >> 32) == 0,"MULH128x96_q8: (__y0.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y1.d1 >> 32) == 0,"MULH128x96_q8: (__y1.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y2.d1 >> 32) == 0,"MULH128x96_q8: (__y2.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y3.d1 >> 32) == 0,"MULH128x96_q8: (__y3.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y4.d1 >> 32) == 0,"MULH128x96_q8: (__y4.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y5.d1 >> 32) == 0,"MULH128x96_q8: (__y5.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y6.d1 >> 32) == 0,"MULH128x96_q8: (__y6.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y7.d1 >> 32) == 0,"MULH128x96_q8: (__y7.d1 >> 32) == 0");\ + DBG_ASSERT((__y0.d1 >> 32) == 0,"MULH128x96_q8: (__y0.d1 >> 32) == 0");\ + DBG_ASSERT((__y1.d1 >> 32) == 0,"MULH128x96_q8: (__y1.d1 >> 32) == 0");\ + DBG_ASSERT((__y2.d1 >> 32) == 0,"MULH128x96_q8: (__y2.d1 >> 32) == 0");\ + DBG_ASSERT((__y3.d1 >> 32) == 0,"MULH128x96_q8: (__y3.d1 >> 32) == 0");\ + DBG_ASSERT((__y4.d1 >> 32) == 
0,"MULH128x96_q8: (__y4.d1 >> 32) == 0");\ + DBG_ASSERT((__y5.d1 >> 32) == 0,"MULH128x96_q8: (__y5.d1 >> 32) == 0");\ + DBG_ASSERT((__y6.d1 >> 32) == 0,"MULH128x96_q8: (__y6.d1 >> 32) == 0");\ + DBG_ASSERT((__y7.d1 >> 32) == 0,"MULH128x96_q8: (__y7.d1 >> 32) == 0");\ \ MULH64( __x0.d0,__y0.d0, __t0);\ MULH64( __x1.d0,__y1.d0, __t1);\ @@ -4359,7 +4359,7 @@ On Alpha, this needs a total of 7 MUL instructions and 12 ALU ops. /* First add [a,b] + [c,d] : since b and d <= 2^64 - 2, can add carryout of a+c sans ripple-carry check: */\ __a += __c;\ __b += __d + (__a < __c);\ - DBG_ASSERT(HERE, (__b >= __d),"MULH128: unexpected carryout of __b");\ + DBG_ASSERT((__b >= __d),"MULH128: unexpected carryout of __b");\ /* Now add [w1,w2,w3] + [a,b,0]: */\ __w1 += __a;\ __cy = (__w1 < __a);\ @@ -5327,8 +5327,8 @@ Similarly, (b.y+x.d)>>32 must be added to the MULH128 result. {\ uint64 __a,__b,__c,__d,__lo64;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\ \ __a = (__x.d0) & (uint64)0x00000000ffffffff;\ __b = (__y.d0) & (uint64)0x00000000ffffffff;\ @@ -5349,8 +5349,8 @@ Similarly, (b.y+x.d)>>32 must be added to the MULH128 result. {\ uint64 __a,__b,__c,__d,__lo64;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\ + DBG_ASSERT((__x.d1 >> 32) == 0,"MULH128_96: (__x.d1 >> 32) == 0");\ + DBG_ASSERT((__y.d1 >> 32) == 0,"MULH128_96: (__y.d1 >> 32) == 0");\ \ __a = (__x.d0) & (uint64)0x00000000ffffffff;\ __b = (__y.d0) & (uint64)0x00000000ffffffff;\ @@ -5395,7 +5395,7 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s. 
{\ uint64 __w0,__w1,__w2,__w3,__w4,__a,__b,__t;\ \ - DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\ + DBG_ASSERT((__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\ /* First calculate high partial products and put into w3 and w4: */\ __t = __x.d2;\ __w4 = __t * __t; /* x2^2 */\ @@ -5462,7 +5462,7 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s. {\ uint64 __w0,__w1,__w2,__w3,__w4,__a,__b,__t;\ \ - DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\ + DBG_ASSERT((__x.d2 >> 32) == 0,"SQR_LOHI160: (__x.d2 >> 32) == 0");\ /* First calculate high partial products and put into w3 and w4: */\ __t = __x.d2;\ __w4 = __t * __t; /* x2^2 */\ @@ -5504,10 +5504,10 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s. __wd0,__wd1,__wd2,__wd3,\ __we0,__we1,__we2,__we3;\ \ - DBG_ASSERT(HERE, (__x0.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x0.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x1.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x2.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x3.d2 >> 32) == 0");\ + DBG_ASSERT((__x0.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x0.d2 >> 32) == 0");\ + DBG_ASSERT((__x1.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x1.d2 >> 32) == 0");\ + DBG_ASSERT((__x2.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x2.d2 >> 32) == 0");\ + DBG_ASSERT((__x3.d2 >> 32) == 0,"SQR_LOHI160_q4: (__x3.d2 >> 32) == 0");\ \ __t0 = __x0.d2;\ __t1 = __x1.d2;\ @@ -5629,14 +5629,14 @@ ALU ops to split the 5 64-bit outputs into a pair of uint160s. 
__wd0,__wd1,__wd2,__wd3,__wd4,__wd5,__wd6,__wd7,\ __we0,__we1,__we2,__we3,__we4,__we5,__we6,__we7;\ \ - DBG_ASSERT(HERE, (__x0.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x0.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x1.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x1.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x2.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x2.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x3.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x3.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x4.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x4.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x5.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x5.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x6.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x6.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__x7.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x7.d2 >> 32) == 0");\ + DBG_ASSERT((__x0.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x0.d2 >> 32) == 0");\ + DBG_ASSERT((__x1.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x1.d2 >> 32) == 0");\ + DBG_ASSERT((__x2.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x2.d2 >> 32) == 0");\ + DBG_ASSERT((__x3.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x3.d2 >> 32) == 0");\ + DBG_ASSERT((__x4.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x4.d2 >> 32) == 0");\ + DBG_ASSERT((__x5.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x5.d2 >> 32) == 0");\ + DBG_ASSERT((__x6.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x6.d2 >> 32) == 0");\ + DBG_ASSERT((__x7.d2 >> 32) == 0,"SQR_LOHI160_q8: (__x7.d2 >> 32) == 0");\ \ __t0 = __x0.d2;\ __t1 = __x1.d2;\ @@ -5949,8 +5949,8 @@ On 32-bit hardware, take advantage of the fact that x2 and y2 are only 32 bits w {\ uint64 __w1,__w2,__w3,__w4,__a,__b,__c,__d,__e,__f,__g,__h,__i,__j,__k,__l;\ \ - DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\ + DBG_ASSERT((__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\ + DBG_ASSERT((__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\ \ __w4 = __x.d2*__y.d2; /* x2*y2 */\ MULH64(__x.d0,__y.d0, __w1); /* x0*y0.hi */\ @@ -6033,8 +6033,8 @@ On 32-bit hardware, take 
advantage of the fact that x2 and y2 are only 32 bits w {\ uint64 __w1,__w2,__w3,__w4,__a,__b,__c,__d,__e,__f,__g,__h,__i,__j,__k,__l;\ \ - DBG_ASSERT(HERE, (__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\ - DBG_ASSERT(HERE, (__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\ + DBG_ASSERT((__x.d2 >> 32) == 0,"MULH160: (__x.d2 >> 32) == 0");\ + DBG_ASSERT((__y.d2 >> 32) == 0,"MULH160: (__y.d2 >> 32) == 0");\ \ __w4 = __x.d2*__y.d2; /* x2*y2 */\ MULH64(__x.d0,__y.d0, __w1); /* x0*y0.hi */\ diff --git a/src/masterdefs.h b/src/masterdefs.h index e74b70a2..cfb95084 100755 --- a/src/masterdefs.h +++ b/src/masterdefs.h @@ -56,7 +56,7 @@ in util.c), otherwise alias the entire 4-argument DBG_ASSERT invocation to "Boli #define DBG_WARN WARN #define DBG_INFO INFO #else /* Bolivian - lump both the FILE and LINE args together as a single __here, that's why it looks like these take 1 less arg than the underlying functions: */ - #define DBG_ASSERT(__here, __arg2, __arg3) /* */ + #define DBG_ASSERT(__arg1, __arg2) /* */ #define DBG_WARN(__here, __arg2, __arg3, __arg4) /* */ #define DBG_INFO(__here, __arg2, __arg3, __arg4) /* */ #endif diff --git a/src/mers_mod_square.c b/src/mers_mod_square.c index 6d35333b..acc20a51 100644 --- a/src/mers_mod_square.c +++ b/src/mers_mod_square.c @@ -221,7 +221,7 @@ The scratch array (2nd input argument) is only needed for data table initializat // v20: got rid of 1st constraint, so we can use a single mode_flag value in p-1 stage 2 for both vecs we want to fwd-FFT-only // but input in fwd-FFT-pass-1-already-done mode and ones where we do both FFTs, input in said form and left so on return: // if(fwd_fft == 1ull) - // ASSERT(HERE, mode_flag < 2, "Only low bit of mode_flag field may be used in this case!"); + // ASSERT(mode_flag < 2, "Only low bit of mode_flag field may be used in this case!"); } /* These came about as a result of multithreading, but now are needed whether built unthreaded or multithreaded */ @@ -247,7 +247,7 @@ The scratch 
array (2nd input argument) is only needed for data table initializat radix0 = RADIX_VEC[0]; nchunks = radix0>>1; - ASSERT(HERE, TRANSFORM_TYPE == REAL_WRAPPER, "mers_mod_square: Incorrect TRANSFORM_TYPE!"); + ASSERT(TRANSFORM_TYPE == REAL_WRAPPER, "mers_mod_square: Incorrect TRANSFORM_TYPE!"); /*...initialize things upon first entry */ @@ -272,7 +272,7 @@ The scratch array (2nd input argument) is only needed for data table initializat { if(!arr_scratch) { sprintf(cbuf, "Init portion of %s requires non-null scratch array!",func); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } first_entry=FALSE; psave = p; @@ -284,7 +284,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(RADIX_VEC[i] == 0) { sprintf(cbuf, "%s: RADIX_VEC[i = %d] zero, for i < [NRADICES = %d]!",func,i,NRADICES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } radix_set_save[i] = RADIX_VEC[i]; } @@ -293,7 +293,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(RADIX_VEC[i] != 0) { sprintf(cbuf, "%s: RADIX_VEC[i = %d] nonzero, for i >= [NRADICES = %d]!",func,i,NRADICES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } radix_set_save[i] = 0; } @@ -304,12 +304,12 @@ The scratch array (2nd input argument) is only needed for data table initializat /* My array padding scheme requires N/radix0 to be a power of 2, and to be >= 2^DAT_BITS, where the latter parameter is set in the Mdata.h file: */ if(n%radix0 != 0) { - sprintf(cbuf ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf ,"ERROR: radix0 does not divide N!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Make sure n/radix0 is a power of 2: */ i = n/radix0; if((i >> trailz32(i)) != 1) { - sprintf(cbuf ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf ,"ERROR: n/radix0 not a power of 2!\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } if(DAT_BITS < 31) @@ -317,7 +317,7 
@@ The scratch array (2nd input argument) is only needed for data table initializat /* Now make sure n/radix0 is sufficiently large (unless n < 2^DAT_BITS, in which case it doesn't matter): */ if(i < (1 << DAT_BITS)) { - // sprintf(cbuf ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS)); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + // sprintf(cbuf ,"ERROR: n/radix0 must be >= %u!\n", (1 << DAT_BITS)); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); // Mar 2018: Switch to 'soft' assertion error here, e.g. for timing tests at small FFT lengths: sprintf(cbuf ,"n/radix0 must be >= %u! Skipping this radix combo.\n", (1 << DAT_BITS)); WARN(HERE, cbuf, "", 1); return(ERR_ASSERT); } @@ -327,7 +327,7 @@ The scratch array (2nd input argument) is only needed for data table initializat { sprintf(cbuf ,"ERROR: final FFT radix may not exceed = %u!\n", (1 << (DAT_BITS-1))); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } } @@ -360,7 +360,7 @@ The scratch array (2nd input argument) is only needed for data table initializat { sprintf(cbuf ,"ERROR: product of radices not equal to complex vector length\n"); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* index = (int *)calloc(k,sizeof(int)); */ @@ -369,7 +369,7 @@ The scratch array (2nd input argument) is only needed for data table initializat { sprintf(cbuf ,"ERROR: unable to allocate array INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } index = ALIGN_INT(index_ptmp); @@ -576,7 +576,7 @@ The scratch array (2nd input argument) is only needed for data table initializat default : sprintf(cbuf ,"ERROR: radix %d not available. Halting...\n",radix0); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } for(i = 1; i < NRADICES; i++) @@ -615,7 +615,7 @@ The scratch array (2nd input argument) is only needed for data table initializat default : sprintf(cbuf ,"ERROR: intermediate radix %d not available. 
Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* Final radix must be 16 or 32: */ @@ -623,7 +623,7 @@ The scratch array (2nd input argument) is only needed for data table initializat { sprintf(cbuf ,"ERROR: final radix %d not available. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } } nradices_prim = l; for( ; l < 30; l++) { radix_prim[l] = 0; } // Zero any higher elements which may have been previously set due @@ -654,7 +654,7 @@ The scratch array (2nd input argument) is only needed for data table initializat { sprintf(cbuf ,"ERROR: NWT does not divide N!\n"); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /*...The roots arrays need only be half the dimension of the weights arrays (since we need n/2 complex roots @@ -665,10 +665,10 @@ The scratch array (2nd input argument) is only needed for data table initializat tmp = (double *)calloc(n/nwt+1 ,sizeof(double)); si = ( int *)calloc(nwt+1 ,sizeof( int)); */ - wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt+1 ); if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp); - wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, n/nwt+radix0 ); if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp); - tmp_ptmp = ALLOC_DOUBLE(tmp_ptmp, n/nwt+1 ); if(!tmp_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array TMP in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }; tmp = ALIGN_DOUBLE(tmp_ptmp); - si_ptmp = ALLOC_INT ( si_ptmp, nwt+1 ); if(!si_ptmp ){ sprintf(cbuf,"ERROR: unable to allocate array SI in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }; si = ALIGN_INT (si_ptmp ); + wt0_ptmp = ALLOC_DOUBLE(wt0_ptmp, nwt+1 ); if(!wt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate 
array WT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); }; wt0 = ALIGN_DOUBLE(wt0_ptmp); + wt1_ptmp = ALLOC_DOUBLE(wt1_ptmp, n/nwt+radix0 ); if(!wt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array WT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); }; wt1 = ALIGN_DOUBLE(wt1_ptmp); + tmp_ptmp = ALLOC_DOUBLE(tmp_ptmp, n/nwt+1 ); if(!tmp_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array TMP in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); }; tmp = ALIGN_DOUBLE(tmp_ptmp); + si_ptmp = ALLOC_INT ( si_ptmp, nwt+1 ); if(!si_ptmp ){ sprintf(cbuf,"ERROR: unable to allocate array SI in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); }; si = ALIGN_INT (si_ptmp ); /******************************************************************/ /* Crandall/Fagin weighting factors and number of bits per digit. */ @@ -702,7 +702,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QWT1= %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } simodn=0; @@ -722,7 +722,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QWT = %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } wt0 [i] = t1; /* Ith DWT weight factor = 2^[(s*i mod N)/N], where the exponent is done using floating divide. 
*/ @@ -765,7 +765,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QWT2= %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } j=0; /* Store I*K mod NN here. We don't directly calculate I*K, since that can overflow a 32-bit integer at large runlengths. */ @@ -785,7 +785,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QWT = %20.15f, DWT = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } tmp[i] = t1; /*fprintf(stderr,"I = %d; TMP = %20.10f\n",i,tmp[i]); */ @@ -837,7 +837,7 @@ The scratch array (2nd input argument) is only needed for data table initializat (i.e. will be accessed using the lower lg(NRT) bits of the integer sincos index): */ rt0_ptmp = ALLOC_COMPLEX(rt0_ptmp, NRT); - if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt0 = ALIGN_COMPLEX(rt0_ptmp); qt = i64_to_q((int64)N2); @@ -861,7 +861,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QCOS1= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } t1 = qfdbl(qi); @@ -878,7 +878,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QSIN1= %20.15f, DSIN = %20.15f DIFFER BY 
%20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qt = QZRO; @@ -900,7 +900,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt0[i].re = t1; @@ -919,7 +919,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: I = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt0[i].im = t1; @@ -936,7 +936,7 @@ The scratch array (2nd input argument) is only needed for data table initializat (and will be accessed using the upper bits, , of the integer sincos index): */ rt1_ptmp = ALLOC_COMPLEX(rt1_ptmp, n/(2*NRT)); - if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt1 = ALIGN_COMPLEX(rt1_ptmp); qn = i64_to_q((int64)NRT); @@ -962,7 +962,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QCOS2= %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } t1 = qfdbl(qi); @@ -979,7 +979,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: QSIN2= %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", 
t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } qt = QZRO; @@ -1002,7 +1002,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QCOS = %20.15f, DCOS = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt1[i].re = t1; @@ -1021,7 +1021,7 @@ The scratch array (2nd input argument) is only needed for data table initializat if(idiff > max_idiff) max_idiff = idiff; sprintf(cbuf,"INFO: J = %8d: QSIN = %20.15f, DSIN = %20.15f DIFFER BY %20.0f\n", i, t1, t2, (double)idiff); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt1[i].im = t1; //if((i & 63) ==0)printf("rt1[%3u] = %20.15f, %20.15f\n",i,rt1[i].re,rt1[i].im); @@ -1130,7 +1130,7 @@ for(i=0; i < NRT; i++) { /* 8/23/2004: Need to allocate an extra element here to account for the padding element that gets inserted when radix0 is odd: */ block_index = (int *)calloc((radix0+1),sizeof(int)); - if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Examples: @@ -1214,7 +1214,7 @@ for(i=0; i < NRT; i++) { */ for(j = 0; j < 2; j++) { - if(!(l >= 0 && l < radix0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!(l >= 0 && l < radix0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } if((blocklen & 1) && j == 1) { @@ -1248,14 +1248,14 @@ for(i=0; i < NRT; i++) { } /* End of Main loop */ /* arrays storing the index values needed for the parallel-block wrapper/square scheme: */ - 
if( !(ws_i = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_j1 = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_j2 = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_j2_start = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_k = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_m = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_blocklen = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_blocklen_sum = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if( !(ws_i = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_j1 = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_j2 = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_j2_start = (int 
*)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_k = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_m = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_blocklen = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_blocklen_sum = (int *)calloc(radix0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } for(ii = 0; ii < radix0; ii += 2) { @@ -1292,7 +1292,7 @@ for(i=0; i < NRT; i++) { */ default : sprintf(cbuf,"ERROR: radix %d not available for wrapper_square. Halting...\n",RADIX_VEC[NRADICES-1]); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } } @@ -1302,7 +1302,7 @@ for(i=0; i < NRT; i++) { fprintf(stderr, "%s:\n",func); fprintf(stderr, " Max abs error between real*8 and real*16 computed values = %20.15f\n", max_adiff); fprintf(stderr, " Max bit error between real*8 and real*16 computed values = %20.0f \n", (double)max_idiff); - ASSERT(HERE, (max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting."); + ASSERT((max_adiff < 100*err_threshold),"Max error between real*8 and real*16 unacceptably high - quitting."); } #ifdef MULTITHREAD @@ -1346,13 +1346,13 @@ for(i=0; i < NRT; i++) { // Threadpool-based dispatch: // MAX_THREADS is the max. no. of threads we expect to be able to make use of, at 1 thread per core. 
- ASSERT(HERE, MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!"); + ASSERT(MAX_THREADS == get_num_cores(), "MAX_THREADS not set or incorrectly set!"); if(nchunks % NTHREADS != 0) fprintf(stderr,"%s: radix0/2 not exactly divisible by NTHREADS - This will hurt performance.\n",func); main_work_units = 0; pool_work_units = nchunks; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("%s: Init threadpool of %d threads\n",func,NTHREADS); #endif // MULTITHREAD? @@ -1671,11 +1671,11 @@ for(i=0; i < NRT; i++) { // v19: Add support for mod_mul with one input being in precomputed fwd-FFTed form: #ifdef MULTITHREAD for(i = 0; i < nchunks; ++i) { tdat[i].arrdat = a; tdat[i].fwd_fft = fwd_fft; tdat[i].c = c; } -// printf("Thread 0: arrdat = 0x%llX, fwd_fft = 0x%llX\n",tdat[0].arrdat,tdat[0].fwd_fft); +// printf("Thread 0: arrdat = %#" PRIX64 ", fwd_fft = %#" PRIX64 "\n",tdat[0].arrdat,tdat[0].fwd_fft); #endif /*...Init clock counter: */ - ASSERT(HERE, tdiff != 0,"mers_mod_square.c: NULL tdiff ptr!"); + ASSERT(tdiff != 0,"mers_mod_square.c: NULL tdiff ptr!"); #ifdef CTIME clock1 = clock(); @@ -1693,12 +1693,12 @@ for(i=0; i < NRT; i++) { */ // Sep 2019: Add support for fwd_fft_only|mode_flag as described in top-of-function comments if(fwd_fft == 2ull) { - // fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = 0x%llX: jumping directly to undo_initial_ffft_pass.\n",ilo,ihi,(uint64)a); + // fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = %#" PRIX64 ": jumping directly to undo_initial_ffft_pass.\n",ilo,ihi,(uint64)a); goto undo_initial_ffft_pass; } if((mode_flag & 1) == 0) { -// if(ihi<1000 && !fwd_fft)fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = 0x%llX, Fwd-WT: mode_flag = 0x%X, a[1] = %18.10f\n",ilo,ihi,(uint64)a,mode_flag,a[1]); +// if(ihi<1000 && 
!fwd_fft)fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = %#" PRIX64 ", Fwd-WT: mode_flag = %#X, a[1] = %18.10f\n",ilo,ihi,(uint64)a,mode_flag,a[1]); // Mar 2017: Can skip this step if it's the start of a production test (note that any initial-residue shift // in such cases is handled via single-array-word forward-DWT-weighting in the Mlucas.c shift_word() function), // but need it if add RNG-input-setting above for debug, hence also check a[1] for nonzero: @@ -1729,10 +1729,10 @@ for(i=0; i < NRT; i++) { simodn += sw; if(simodn >= n) simodn -= n; bimodn += bw; if(bimodn >= n) bimodn -= n; // if(simodn != n - bimodn) printf("I = %d: simodn[%u] != n - bimodn[%u]\n",i,simodn,n - bimodn); - // ASSERT(HERE, simodn == n - bimodn, "simodn != n - bimodn"); <*** cannot require this because (for i = n-1) have simodn = 0, bimodn = n, - ASSERT(HERE, DNINT(a[j]) == a[j],"mers_mod_square.c: Input a[j] noninteger!"); + // ASSERT(simodn == n - bimodn, "simodn != n - bimodn"); <*** cannot require this because (for i = n-1) have simodn = 0, bimodn = n, + ASSERT(DNINT(a[j]) == a[j],"mers_mod_square.c: Input a[j] noninteger!"); fracmax = fabs( wt*wtinv*radix0 - 1.0 ); - ASSERT(HERE, fracmax < 1e-10, "wt*wtinv check failed!"); + ASSERT(fracmax < 1e-10, "wt*wtinv check failed!"); a[j] *= wt; ii =((uint32)(sw - bimodn) >> 31); } @@ -1756,7 +1756,7 @@ for(i=0; i < NRT; i++) { */ ierr = 0; /* Any return-value error code (whether fatal or not) stored here */ - ASSERT(HERE, ihi > ilo,"mers_mod_square.c: ihi <= ilo!"); + ASSERT(ihi > ilo,"mers_mod_square.c: ihi <= ilo!"); #if DBG_THREADS fprintf(stderr,"%s: NTHREADS = %3d\n",func,NTHREADS); @@ -1798,7 +1798,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) while(tpool->free_tasks_queue.num_tasks != pool_work_units) { // sleep(1); //*** too granular *** // Finer-resolution, declared in ; cf. 
http://linux.die.net/man/2/nanosleep - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!"); // printf("sleep; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); } // printf("end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1814,7 +1814,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) #endif if(fwd_fft == 1) { - // fprintf(stderr,"[ilo,ihi] = [%u,%u]: fwd_fft = %llu, mode_flag = %u: exiting after fwd-FFT.\n",ilo,ihi,fwd_fft,mode_flag); + // fprintf(stderr,"[ilo,ihi] = [%u,%u]: fwd_fft = %" PRIu64 ", mode_flag = %u: exiting after fwd-FFT.\n",ilo,ihi,fwd_fft,mode_flag); return 0; // Skip carry step [and preceding inverse-FFT] in this case } // Update RES_SHIFT via mod-doubling, *** BUT ONLY IF IT'S AN AUTOSQUARE ***: @@ -1931,7 +1931,7 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) ierr = radix4096_ditN_cy_dif1 (a,n,nwt,nwt_bits,wt0,wt1,si,0x0,0x0,base,baseinv,iter,&fracmax,p); break; */ default : - sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix0); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. 
Halting...\n",radix0); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } // v19: Nonzero exit carries used to be fatal, added retry-from-last-savefile handling for these @@ -2025,15 +2025,15 @@ for(iter=ilo+1; iter <= ihi && MLUCAS_KEEP_RUNNING; iter++) // On early-exit-due-to-interrupt, decrement iter since we didn't actually do the (iter)th iteration if(!MLUCAS_KEEP_RUNNING) { iter--; -// fprintf(stderr,"%s: fwd_fft_only = 0x%016X, fwd_fft = %X; Caught interrupt at iter = %u; --iter = %u\n",func,fwd_fft_only,fwd_fft,iter+1,iter); +// fprintf(stderr,"%s: fwd_fft_only = %#016X, fwd_fft = %X; Caught interrupt at iter = %u; --iter = %u\n",func,fwd_fft_only,fwd_fft,iter+1,iter); } if(iter < ihi) { - ASSERT(HERE, !MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!"); + ASSERT(!MLUCAS_KEEP_RUNNING, "Premature iteration-loop exit due to unexpected condition!"); ierr = ERR_INTERRUPT; ROE_ITER = iter; // Function return value used for error code, so save number of last-iteration-completed-before-interrupt here -// fprintf(stderr,"Caught signal at iter = %u; mode_flag = 0x%X\n",iter,mode_flag); +// fprintf(stderr,"Caught signal at iter = %u; mode_flag = %#X\n",iter,mode_flag); mode_flag &= 0xfffffffd; // v20: In case of interrupt-exit override any mode_flag "skip undo of initial DIF pass" setting -// fprintf(stderr,"After ^2-toggle, mode_flag = 0x%X, (mode_flag >> 1) = 0x%X\n",mode_flag,mode_flag>>1); +// fprintf(stderr,"After ^2-toggle, mode_flag = %#X, (mode_flag >> 1) = %#X\n",mode_flag,mode_flag>>1); } #ifdef RTIME @@ -2060,7 +2060,7 @@ if(iter < ihi) { if((mode_flag >> 1) == 0) { - // fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = 0x%llX, Inv-WT: mode_flag = 0x%X\n",ilo,ihi,(uint64)a,mode_flag); + // fprintf(stderr,"[ilo,ihi] = [%u,%u], Array = %#" PRIX64 ", Inv-WT: mode_flag = %#X\n",ilo,ihi,(uint64)a,mode_flag); func_dit1(a,n); /*...and unweight the data array. 
*/ @@ -2127,7 +2127,7 @@ if(iter < ihi) { // [action] Prior to returning, print a "retry successful" informational and rezero ROE_ITER and ROE_VAL. // *** v19: For PRP-test Must make sure we are at end of checkpoint-file iteration interval, not one of the Gerbicz-update subintervals *** if(!INTERACT && ROE_ITER > 0 && ihi%ITERS_BETWEEN_CHECKPOINTS == 0) { // In interactive (timing-test) mode, use ROE_ITER to accumulate #iters-with-dangerous-ROEs - ASSERT(HERE, (ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!"); + ASSERT((ierr == 0) && (iter = ihi+1), "[2a] sanity check failed!"); ROE_ITER = 0; ROE_VAL = 0.0; sprintf(cbuf,"Retry of iteration interval with fatal roundoff error was successful.\n"); @@ -2196,7 +2196,7 @@ void mers_process_chunk( dyadic-multiply FFT(a) * FFT(b) and iFFT the product, storing the result in a[]. */ if((fwd_fft & 0xC) != 0) { - ASSERT(HERE, ((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *"); + ASSERT(((fwd_fft & 0xF) == 0xC) && ((fwd_fft>>4) != 0x0), "Bits 2:3 of fwd_fft == 3: Expect Bits 0:1 == 0 and nonzero b[] = hi60! *"); } else { for(j = 0; j < jhi; j++) { @@ -2226,7 +2226,7 @@ void mers_process_chunk( case 32 : radix32_dif_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break; default : - sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } k += mm*radix0; @@ -2262,7 +2262,7 @@ void mers_process_chunk( radix64_wrapper_square(a,arr_scratch,n,radix0,rt0,rt1,nradices_prim,radix_prim, ws_i[l], ws_j1[l], ws_j2[l], ws_j2_start[l], ws_k[l], ws_m[l], ws_blocklen[l], ws_blocklen_sum[l],init_sse2,thr_id, fwd_fft, c); break; */ default : - sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. 
Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for wrapper/square. Halting...\n",RADIX_VEC[NRADICES-1]); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } @@ -2286,7 +2286,7 @@ void mers_process_chunk( { /* Get block index of the chunk of contiguous data to be processed: */ l = block_index[ii + j]; - ASSERT(HERE, l >= 0,"mers_mod_square.c: l >= 0"); + ASSERT(l >= 0,"mers_mod_square.c: l >= 0"); /* Quick-n-dirty way of generating the correct starting values of k, mm and incr - simply use the skeleton of the forward (DIF) loop, sans the i = NRADICES-2 pass @@ -2330,7 +2330,7 @@ void mers_process_chunk( case 32 : radix32_dit_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break; default : - sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } /* end i-loop */ diff --git a/src/mi64.c b/src/mi64.c index 08bb5c88..1a031a4f 100755 --- a/src/mi64.c +++ b/src/mi64.c @@ -78,7 +78,7 @@ __device__ uint32 mi64_twopmodq_gpu( hi = gpu_thread_local + lenQ*6; scratch = hi + lenQ; scratch2 = hi + lenQ*2; - cyout = mi64_mul_scalar(p,k<<1,q,lenQ); ASSERT(HERE, 0 == cyout, "unexpected carryout of 2*p*k!"); + cyout = mi64_mul_scalar(p,k<<1,q,lenQ); ASSERT(0 == cyout, "unexpected carryout of 2*p*k!"); q[0] += 1; // q = 2.k.p + 1; No need to check for carry since 2.k.p even mi64_shrl_short(q, qhalf, 1, lenQ); /* (q >> 1) = (q-1)/2, since q odd. 
*/ @@ -163,7 +163,7 @@ __device__ uint32 mi64_twopmodq_gpu( return mi64_cmp_eq_scalar(x, 1ull, lenQ); #else // ifndef __CUDA_ARCH__ - ASSERT(HERE, 0, "Device code being called in host mode!"); + ASSERT(0, "Device code being called in host mode!"); return 0; #endif } @@ -208,7 +208,7 @@ void mi64_brev(uint64 x[], uint32 n) x[wi] &= ~mi; x[wj] &= ~mj; // Mask off the bits to be swapped x[wi] ^= bj<> 6), rembits = (nshift & 63), m64bits; uint64 lo64 = 0ull; - ASSERT(HERE, len != 0, "mi64_shl: zero-length array!"); + ASSERT(len != 0, "mi64_shl: zero-length array!"); // Special-casing for 0 shift count: if(!nshift) { if(x != y) mi64_set_eq(y, x, len); // Set y = x @@ -415,13 +415,13 @@ __device__ #endif void mi64_shlc(const uint64 x[], uint64 y[], uint32 nbits, uint32 nshift, uint32 len, uint32 sign_flip) { /**** NOTE: The (nbits+63) here means the largest exponent currently testable is 4294967231 = 2^32-65, larger ones like 4294967291 = 2^32-5 overflow uint32 => nwmod = 0 ****/ - ASSERT(HERE, nshift <= nbits && (nbits+63) <= 0xFFFFFFFFu, "mi64_shlc: Require (nshift <= nbits) and (nbits+63) < 2^32!"); + ASSERT(nshift <= nbits && (nbits+63) <= 0xFFFFFFFFu, "mi64_shlc: Require (nshift <= nbits) and (nbits+63) < 2^32!"); uint32 i = nbits&63, nwshift = (nshift+63) >> 6, nwmod = ((nbits + 63)>>6); // Here nwshift includes any partial words in addition to fullwords - ASSERT(HERE, x && len, "mi64_shlc: null input pointer or zero-length array!"); + ASSERT(x && len, "mi64_shlc: null input pointer or zero-length array!"); // W/o the extra "& (nbits&63)" this assumes nbits != 0, i.e. 
unsuitable for Fermats: uint64 cy, mask64 = (-1ull << i) & -(uint64)(i != 0); // = (-1ull << i) if Mersenne, 0 if Fermat - ASSERT(HERE, (x[len-1] & mask64) == 0ull, "mi64_shlc: x[] has set bits beyond [nbits] position in high word!"); -// printf("mi64_shlc: %u bits, %u limbs, mask64 = 0x%llX, high limb = 0x%llX\n",nbits,len,mask64,x[len-1]); + ASSERT((x[len-1] & mask64) == 0ull, "mi64_shlc: x[] has set bits beyond [nbits] position in high word!"); +// printf("mi64_shlc: %u bits, %u limbs, mask64 = %#" PRIX64 ", high limb = %#" PRIX64 "\n",nbits,len,mask64,x[len-1]); #ifndef __CUDA_ARCH__ /* Scratch array for storing off-shifted intermediate (need this to support in-place functionality): */ static uint64 *u = 0x0; @@ -431,10 +431,10 @@ void mi64_shlc(const uint64 x[], uint64 y[], uint32 nbits, uint32 nshift, uint32 if(dimU < 2*(nwmod+1)) { // GG: fixed bug in comparison dimU = 2*(nwmod+1); // Alloc 2x the immediately-needed to avoid excessive reallocs if needed size increases incrementally - u = (uint64 *)realloc(u, dimU*sizeof(uint64)); ASSERT(HERE, u != 0x0, "alloc failed!"); + u = (uint64 *)realloc(u, dimU*sizeof(uint64)); ASSERT(u != 0x0, "alloc failed!"); } #endif - ASSERT(HERE, nshift <= nbits, "mi64_shlc: shift count must be <= than bits in modulus!"); // This also ensures (nwshift < nwmod) + ASSERT(nshift <= nbits, "mi64_shlc: shift count must be <= than bits in modulus!"); // This also ensures (nwshift < nwmod) // Special-casing for 0 shift count, which includes the 1-full-rotation case nshift == nbits: if(!nshift || (nshift == nbits)) { if(x != y) mi64_set_eq(y, x, len); // Set y = x @@ -451,10 +451,10 @@ void mi64_shlc(const uint64 x[], uint64 y[], uint32 nbits, uint32 nshift, uint32 if(nwshift < len) cy = mi64_sub_scalar(y+nwshift,cy,y+nwshift,len-nwshift); // In Fermat-mod case, if high bits happen to = 0, must (mod Fm) by adding borrow = 1 back into low limb: - ASSERT(HERE, mi64_sub_scalar(y,cy,y,len) == 0ull, "Nonzero carryout of (mod Fm) low-limb 
incrementing!"); + ASSERT(mi64_sub_scalar(y,cy,y,len) == 0ull, "Nonzero carryout of (mod Fm) low-limb incrementing!"); } } else { - cy = mi64_add(y, u, y, nwshift); ASSERT(HERE, cy == 0ull, "Nonzero carryout of nonoverlapping vector add!"); + cy = mi64_add(y, u, y, nwshift); ASSERT(cy == 0ull, "Nonzero carryout of nonoverlapping vector add!"); } } @@ -493,8 +493,8 @@ uint32 mi64_shlc_bits_align(const uint64 x[], uint64 y[], uint32 nbits) uint32 len = (nbits+63)>>6, i,match = 0, curr_word,curr_bit,main_part,high_part,hi_word_bits = nbits&63; // W/o the extra "& (nbits&63)" this assumes nbits != 0, i.e. unsuitable for Fermats: uint64 mask64 = (-1ull << (nbits&63)) & (uint64)(nbits&63); - ASSERT(HERE, x && y && len, "mi64_shlc_bits_align: null input pointer or zero-length array!"); - ASSERT(HERE, (x[len-1] & mask64) == 0ull && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_align: x or y has set bits beyond [nbits] position in high word!"); + ASSERT(x && y && len, "mi64_shlc_bits_align: null input pointer or zero-length array!"); + ASSERT((x[len-1] & mask64) == 0ull && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_align: x or y has set bits beyond [nbits] position in high word!"); // Special-casing for in-place and 0-length case: if(!nbits || (x == y)) return 0; // Special-casing for single-word inputs: @@ -546,8 +546,8 @@ uint32 mi64_shlc_bits_limb0(const uint64 x0, const uint64 y[], uint32 nbits) uint32 len = (nbits+63)>>6, i, curr_word,curr_bit,main_part,high_part,hi_word_bits = nbits&63; // W/o the extra "& (nbits&63)" this assumes nbits != 0, i.e. 
unsuitable for Fermats: uint64 mask64 = (-1ull << (nbits&63)) & (uint64)(nbits&63); - ASSERT(HERE, y && len, "mi64_shlc_bits_limb0: null input pointer or zero-length array!"); - ASSERT(HERE, (nbits > 64 || (x0 & mask64) == 0ull) && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_limb0: x or y has set bits beyond [nbits] position in high word!"); + ASSERT(y && len, "mi64_shlc_bits_limb0: null input pointer or zero-length array!"); + ASSERT((nbits > 64 || (x0 & mask64) == 0ull) && (y[len-1] & mask64) == 0ull, "mi64_shlc_bits_limb0: x or y has set bits beyond [nbits] position in high word!"); // Special-casing for 0-length case: if(!nbits) return 0; // Special-casing for single-word inputs: @@ -601,7 +601,7 @@ uint64 mi64_shrl(const uint64 x[], uint64 y[], uint32 nshift, uint32 len, uint32 int i; uint32 nwshift = (nshift >> 6), rembits = (nshift & 63), m64bits; uint64 hi64 = 0ull; - ASSERT(HERE, len != 0, "mi64_shrl: zero-length array!"); + ASSERT(len != 0, "mi64_shrl: zero-length array!"); /* Ex 1: len = 1132 = 72448 bits, nshift = 70000, nwshift = 70000>>6 = 1093, rembits = 70000%64 = 48, m64bits = 64-rembits = 16 Thus we want the hi 2448 bits (38 full words + 16 bits) of x, and require output_len >= 39 . @@ -613,7 +613,7 @@ uint64 mi64_shrl(const uint64 x[], uint64 y[], uint32 nshift, uint32 len, uint32 But user has specified output_len = 16, meaning they want at most 1024 bits of x[], so only copy that many and exit. 
So allow output_len to be 1 limb smaller than 17 as a fudge factor to handle arbitrary in-word copy-bit boundaries: */ - ASSERT(HERE, output_len >= (len-nwshift)-1, "mi64_shrl: output_len must be large enough to hold result!"); + ASSERT(output_len >= (len-nwshift)-1, "mi64_shrl: output_len must be large enough to hold result!"); // Special-casing for 0 shift count: if(!nshift) { if(x != y) { @@ -677,7 +677,7 @@ uint64 mi64_shl_short_ref(const uint64 x[], uint64 y[], uint32 nshift, uint32 le int i; uint32 m64bits = (64-nshift); uint64 lo64 = 0ull; - ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); + ASSERT(len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); // Special-casing for 0 shift count: if(!nshift) { if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; } @@ -715,7 +715,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) int i, i0 = 0, i1 = 1, use_asm = FALSE, x_misalign = 0, y_misalign = 0; uint32 m64bits = (64-nshift), leftover = 0; uint64 lo64 = 0ull; - ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); + ASSERT(len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); // Special-casing for 0 shift count: if(!nshift) { if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; } @@ -770,7 +770,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) 3. if x,y have same 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, find i0 >= 1 such that x[i0] is SIMD-aligned. 
*/ if( ((uintptr_t)x & 0x7) != 0 || ((uintptr_t)y & 0x7) != 0 ) - ASSERT(HERE, 0, "require 8-byte alignment of x,y!"); + ASSERT(0, "require 8-byte alignment of x,y!"); // In SIMD-ASM case, x_misalign = (0,1,2, or 3) how many words x[0] is above next-lower alignment boundary: x_misalign = ((uintptr_t)x & BASEADDRMASK)>>3; y_misalign = ((uintptr_t)y & BASEADDRMASK)>>3; @@ -808,7 +808,7 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) */ #if MI64_SHL1_DBG if(dbg) - printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = 0x%X,0x%X, base-addr for SHL macro = 0x%X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); + printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = %#X,%#X, base-addr for SHL macro = %#X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); #endif // Full-vector (except for x[0]) processing loop if no ASM; high-words cleanup-loop if ASM: for(i = len-1; i >= i1; i--) { @@ -1102,19 +1102,19 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) // Low-end clean-up loop (only used in ASM-loop case): for(i = i0-1; i > 0; i--) { #if MI64_SHL1_DBG - if(dbg) printf("Low-end clean-up loop: x[%u,%u] = 0x%16llX,0x%16llX; <<%u,>>%u = 0x%16llX,0x%16llX\n",i,i-1,x[i],x[i-1],nshift,m64bits,(x[i] << nshift),(x[i-1] >> m64bits)); + if(dbg) printf("Low-end clean-up loop: x[%u,%u] = %#16" PRIX64 ",%#16" PRIX64 "; <<%u,>>%u = %#16" PRIX64 ",%#16" PRIX64 "\n",i,i-1,x[i],x[i-1],nshift,m64bits,(x[i] << nshift),(x[i-1] >> m64bits)); #endif y[i] = (x[i] << nshift) + (x[i-1] >> m64bits); #if MI64_SHL1_DBG - if(dbg) printf(" ==> y[%u] = 0x%16llX\n",i,y[i]); + if(dbg) printf(" ==> y[%u] = %#16" PRIX64 "\n",i,y[i]); #endif } // Least-significant element gets zeros shifted in from the right: y[0] = (x[0] << nshift); #if MI64_SHL1_DBG if(len < 1000) 
{ - if(lo64 != ref[len]) { printf("SHL1 Carryout mismatch: (y[%u] = %16llX) != (ref[%u] = %16llX)\n",len,lo64,len,ref[len]); ASSERT(HERE, 0, "Exiting!"); } - if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16llX) != (ref[%u] = %16llX)\n",i,y[i],i,ref[i]); printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, misalign = %u, use_asm = %u; x,y = 0x%X,0x%X, base-addr for SHL macro = 0x%X\n",nshift,len,i0,i1,leftover,x_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); ASSERT(HERE, 0, "Exiting!"); } } } + if(lo64 != ref[len]) { printf("SHL1 Carryout mismatch: (y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",len,lo64,len,ref[len]); ASSERT(0, "Exiting!"); } + if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",i,y[i],i,ref[i]); printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, misalign = %u, use_asm = %u; x,y = %#X,%#X, base-addr for SHL macro = %#X\n",nshift,len,i0,i1,leftover,x_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); ASSERT(0, "Exiting!"); } } } } #endif return lo64; @@ -1134,7 +1134,7 @@ uint64 mi64_shrl_short_ref(const uint64 x[], uint64 y[], uint32 nshift, uint32 l int i; uint32 m64bits = (64-nshift), leftover = 0; uint64 hi64 = 0ull; - ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); + ASSERT(len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); // Special-casing for 0 shift count: if(!nshift) { if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; } @@ -1169,7 +1169,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) int i, i0 = 0, i1 = 0, use_asm = FALSE, x_misalign, y_misalign; uint32 m64bits = (64-nshift), leftover = 0; uint64 hi64 = 0ull; - ASSERT(HERE, len != 0 && nshift < 64, "mi64_shl: zero-length array or shift count >= 64!"); + ASSERT(len != 0 && nshift < 64, "mi64_shl: zero-length 
array or shift count >= 64!"); // Special-casing for 0 shift count: if(!nshift) { if(x != y) for(i = 0; i < len; i++){ y[i] = x[i]; } @@ -1223,7 +1223,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) 3. if x,y have same 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, find i0 >= 0 such that x[i0] is SIMD-aligned. */ if( ((uintptr_t)x & 0x7) != 0 || ((uintptr_t)y & 0x7) != 0 ) - ASSERT(HERE, 0, "require 8-byte alignment of x,y!"); + ASSERT(0, "require 8-byte alignment of x,y!"); x_misalign = ((uintptr_t)x & BASEADDRMASK)>>3; y_misalign = ((uintptr_t)y & BASEADDRMASK)>>3; // minlen may have been incr. for alignment purposes, so use_asm not an unconditional TRUE here @@ -1253,7 +1253,7 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) #if MI64_SHR1_DBG if(dbg) - printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = 0x%X,0x%X, base-addr for SHRL macro = 0x%X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); + printf("nshift = %u: len,i0,i1,leftover = %u,%u,%u,%u, x,y_misalign = %u,%u, use_asm = %u; x,y = %#X,%#X, base-addr for SHRL macro = %#X\n",nshift,len,i0,i1,leftover,x_misalign,y_misalign,use_asm,(uint32)x,(uint32)y,(uint32)(x+i1-2)); #endif // Low-end cleanup-loop if ASM: for(i = 0; i < i0; i++) { @@ -1607,8 +1607,8 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len) #if MI64_SHR1_DBG if(len < 1000) { - if(hi64 != ref[len]) { printf("SHR1 Carryout mismatch: (y[%u] = %16llX) != (ref[%u] = %16llX)\n",len,hi64,len,ref[len]); ASSERT(HERE, 0, "Exiting!"); } - if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16llX) != (ref[%u] = %16llX)\n",i,y[i],i,ref[i]); ASSERT(HERE, 0, "Exiting!"); } } } + if(hi64 != ref[len]) { printf("SHR1 Carryout mismatch: (y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",len,hi64,len,ref[len]); ASSERT(0, 
"Exiting!"); } + if(!mi64_cmp_eq(y,ref,len)) { for(i = len-1; i >= 0; i--) { if(y[i] != ref[i]) { printf("(y[%u] = %16" PRIX64 ") != (ref[%u] = %16" PRIX64 ")\n",i,y[i],i,ref[i]); ASSERT(0, "Exiting!"); } } } } #endif return hi64; @@ -1623,7 +1623,7 @@ uint32 mi64_cmpult(const uint64 x[], const uint64 y[], uint32 len) { uint32 i; // Need hard-assert here due to zero-element default compare: - ASSERT(HERE, len != 0, "mi64_cmpult: zero-length array!"); + ASSERT(len != 0, "mi64_cmpult: zero-length array!"); for(i = len-1; i !=0 ; i--) /* Loop over all but the 0 elements while equality holds.... */ { if(x[i] < y[i]) { @@ -1643,7 +1643,7 @@ uint32 mi64_cmp_eq(const uint64 x[], const uint64 y[], uint32 len) uint32 i; // Allow for zero-length here with default return TRUE, // according to the convention that a zero-length mi64 object = 0: - ASSERT(HERE, len != 0, "mi64_cmp_eq: zero-length array!"); // allows us to catch zero-length cases in debug build & test + ASSERT(len != 0, "mi64_cmp_eq: zero-length array!"); // allows us to catch zero-length cases in debug build & test for(i = 0; i < len; i++) { if(x[i] != y[i]) return FALSE; @@ -1656,7 +1656,7 @@ __device__ #endif uint32 mi64_cmplt_scalar(const uint64 x[], uint64 a, uint32 len) { - ASSERT(HERE, len != 0, "zero-length array!"); + ASSERT(len != 0, "zero-length array!"); return ( (mi64_getlen(x, len) <= 1) && (x[0] < a) ); } @@ -1665,7 +1665,7 @@ __device__ #endif uint32 mi64_cmpgt_scalar(const uint64 x[], uint64 a, uint32 len) { - ASSERT(HERE, len != 0, "zero-length array!"); + ASSERT(len != 0, "zero-length array!"); return ( (x[0] > a) || (mi64_getlen(x, len) > 1) ); } @@ -1674,7 +1674,7 @@ __device__ #endif uint32 mi64_cmp_eq_scalar(const uint64 x[], uint64 a, uint32 len) { - ASSERT(HERE, len != 0, "mi64_cmp_eq_scalar: zero-length array!"); + ASSERT(len != 0, "mi64_cmp_eq_scalar: zero-length array!"); return ( (x[0] == a) && (mi64_getlen(x+1, len-1) == 0) ); } @@ -1745,7 +1745,7 @@ int mi64_ith_set_bit(const uint64 
x[], uint32 bit, uint32 len) { int curr_pop,i,j,retval = 0; if(!len || !bit) return -1; - ASSERT(HERE, bit <= (len<<6), "[bit]th-bit specifier out of range!"); + ASSERT(bit <= (len<<6), "[bit]th-bit specifier out of range!"); // Find the word in which the [bit]th set-bit occurs: for(i = 0; i < len; i++) { curr_pop = popcount64(x[i]); @@ -1769,7 +1769,7 @@ __device__ uint32 mi64_trailz(const uint64 x[], uint32 len) { uint32 i, tz = 0; - ASSERT(HERE, len != 0, "mi64_trailz: zero-length array!"); + ASSERT(len != 0, "mi64_trailz: zero-length array!"); for(i = 0; i < len; i++, tz += 64) { if(x[i]) { return tz + trailz64(x[i]); @@ -1832,8 +1832,8 @@ __device__ #endif void mi64_md5(uint64 x[], uint32 len, uint64 md5[], char*const md5_str) { - ASSERT(HERE, x != 0x0, "mi64_md5: null input pointer!"); - ASSERT(HERE, md5_str != 0x0, "mi64_md5: null md5_str pointer!"); + ASSERT(x != 0x0, "mi64_md5: null input pointer!"); + ASSERT(md5_str != 0x0, "mi64_md5: null md5_str pointer!"); md5_str[0] = '\0'; // should be null on entry, but better safe than sorry uint32 i,j, lz = mi64_leadz(x,len); // lz = #leading 0-bits in x uint32 n = len<<6; // n = 64*len = #bits in the [len] words of x, including leading 0-bits @@ -1841,7 +1841,7 @@ void mi64_md5(uint64 x[], uint32 len, uint64 md5[], char*const md5_str) // Compute the working length [nword]: uint32 nblock = (nbit+576)>>9; // needed number of 512-bit data chucks: nblock = (nbit+576)/512 uint32 nword = nblock<<3; // nword = 8*#blocks: From here on will use that as the working length - ASSERT(HERE, len >= nword, "mi64_md5: input-vector lacks sufficient 0-padding!"); + ASSERT(len >= nword, "mi64_md5: input-vector lacks sufficient 0-padding!"); // Pre-processing: // 1. 
first a single bit, 1, is appended to the end of the message: mi64_set_bit(x,nbit,nword,1); // nword here is only used by mi64_set_bit() for bounds-checking @@ -1931,15 +1931,15 @@ uint32 mi64_extract_lead64(const uint64 x[], uint32 len, uint64*result) { uint32 i,nshift,nwshift,rembits; - ASSERT(HERE, len != 0, "mi64_extract_lead64: zero-length array!"); + ASSERT(len != 0, "mi64_extract_lead64: zero-length array!"); nshift = mi64_leadz(x, len); nwshift = (nshift >> 6); rembits = (nshift & 63); /* shift-word count may == len, but only if x[] = 0: */ if(nwshift >= len) { - ASSERT(HERE, nwshift == len, "mi64_extract_lead64: nwshift out of range!"); - ASSERT(HERE, mi64_iszero(x, len), "mi64_extract_lead64: expected zero-valued array!"); + ASSERT(nwshift == len, "mi64_extract_lead64: nwshift out of range!"); + ASSERT(mi64_iszero(x, len), "mi64_extract_lead64: expected zero-valued array!"); *result = 0ull; } else { i = len-1-nwshift; @@ -1964,19 +1964,19 @@ double mi64_cvt_double(const uint64 x[], uint32 len) if(lead64 == 0ull) { return 0.0; } - ASSERT(HERE,(lead64 >> 63) == 1ull, "mi64_cvt_double: lead64 lacks leftmost ones bit!"); + ASSERT((lead64 >> 63) == 1ull, "mi64_cvt_double: lead64 lacks leftmost ones bit!"); /* round based on 1st neglected bit: */ lead64_rnd = (lead64 >> 11) + ((lead64 >> 10) & 0x0000000000000001ull); /* exponent: */ itmp64 = (((uint64)0x3FD + (uint64)pow2) << 52); /* Add in mantissa, with hidden bit made explicit, hence the 0x3FD (rather than 0x3FE) initializer */ itmp64 += lead64_rnd; - ASSERT(HERE, itmp64 > lead64_rnd , "mi64_cvt_double: Exponent overflows IEEE64 field"); + ASSERT(itmp64 > lead64_rnd , "mi64_cvt_double: Exponent overflows IEEE64 field"); /* GCC bug: needed to add the explicit sign-check below, otherwise GCC 'optimizes' away the (*(double *)&itmp64): */ retval = *(double *)&itmp64; if(retval < 0.0) { - sprintf(cbuf, "rng_isaac_rand_double_norm_pos: lead64 = %16llx, itmp64 = %16llx, retval = %lf not in [0,1]!\n", lead64, 
itmp64, retval); - ASSERT(HERE, 0, cbuf); + sprintf(cbuf, "rng_isaac_rand_double_norm_pos: lead64 = %16" PRIx64 ", itmp64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", lead64, itmp64, retval); + ASSERT(0, cbuf); } return retval; } @@ -2002,8 +2002,8 @@ void mi64_extract_lead128(const uint64 x[], uint32 len, uint32 nshift, uint64 le { lead_x[0] = lead_x[1] = 0; - ASSERT(HERE, len != 0, "mi64_extract_lead128: zero-length array!"); - ASSERT(HERE, nshift < 64, "mi64_extract_lead128: illegal nshift value!"); + ASSERT(len != 0, "mi64_extract_lead128: zero-length array!"); + ASSERT(nshift < 64, "mi64_extract_lead128: illegal nshift value!"); /* Syntax reminder: MVBITS(from_integer,low_bit_of_from_integer,num_bits,to_integer,insert_bits_in_to_integer_starting_at_this_low_bit) @@ -2100,7 +2100,7 @@ uint64 mi64_add_ref(const uint64 x[], const uint64 y[], uint64 z[], uint32 len) { uint32 i; uint64 tmp, cy = 0; - ASSERT(HERE, len != 0, "mi64_add: zero-length array!"); + ASSERT(len != 0, "mi64_add: zero-length array!"); for(i = 0; i < len; i++) { tmp = x[i] + cy; @@ -2145,7 +2145,7 @@ uint64 mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len, // SdyBr: 6.60 uint32 i; uint64 tmp, cy = 0; - ASSERT(HERE, len != 0, "mi64_add: zero-length array!"); + ASSERT(len != 0, "mi64_add: zero-length array!"); for(i = 0; i < len; i++) { tmp = x[i] + cy; @@ -2250,7 +2250,7 @@ uint64 mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len, // uint32 i, odd = (len&1), len2 = len >> 1; uint64 tmp, cy = 0, c2 = 0; - ASSERT(HERE, has_sse42() != 0, "This ASM requires SSE4.2, which is unavailable on this CPU!"); + ASSERT(has_sse42() != 0, "This ASM requires SSE4.2, which is unavailable on this CPU!"); if(len2) { /* x86_64 ASM implementation of the add/carry loop: */ __asm__ volatile (\ @@ -2328,7 +2328,7 @@ uint64 mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len, // Jun 2016: bizarre ... 
GCC builds with opt > 0 on Haswell/Broadwell init this != 0 ... // making static not a reliable workaround, so try put cy = 0 init on separate line from declaration: uint64 cy; - cy = 0ull; ASSERT(HERE, cy == 0, "Init (cy = 0) fails!"); + cy = 0ull; ASSERT(cy == 0, "Init (cy = 0) fails!"); /* x86_64 ASM implementation of the add/carry loop: */ __asm__ volatile (\ "movq %[__x0],%%rax \n\t"/* &x[0] */\ @@ -2385,7 +2385,7 @@ uint64 mi64_add_cyin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len, uint32 i, lrem = (len&7), len8 = len >> 3; uint64 tmp, cy = 0, c2 = 0; #error mi64_add: no AVX512 support yet! - ASSERT(HERE, has_avx512() != 0, "This ASM requires AVX512, which is unavailable on this CPU!"); + ASSERT(has_avx512() != 0, "This ASM requires AVX512, which is unavailable on this CPU!"); vpcmpuq *** how to encode the base.offset data? *** vpgatherqq %%zmmM,%%zmmD[255] // zmmM has base_addr and @@ -2484,17 +2484,17 @@ uint64 mi64_sub(const uint64 x[], const uint64 y[], uint64 z[], uint32 len) uint32 i; uint64 tmp, tmp2, bw = 0; - ASSERT(HERE, len != 0, "mi64_sub: zero-length array!"); + ASSERT(len != 0, "mi64_sub: zero-length array!"); for(i = 0; i < len; i++) { tmp = x[i] - bw; bw = (tmp > x[i]); //bw = ((uint64)tmp > (uint64)x[i]); - ASSERT(HERE, bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!"); + ASSERT(bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!"); /* Need an extra temp here due to asymmetry of subtract: */ tmp2= tmp - y[i]; bw += (tmp2 > tmp); //bw += ((uint64)tmp2 > (uint64)tmp); - ASSERT(HERE, (tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!"); + ASSERT((tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!"); z[i] = tmp2; } return bw; @@ -2509,17 +2509,17 @@ uint64 mi64_sub_bwin(const uint64 x[], const uint64 y[], uint64 z[], uint32 len, uint32 i; uint64 tmp, 
tmp2, bw = bwin; - ASSERT(HERE, len != 0, "mi64_sub: zero-length array!"); + ASSERT(len != 0, "mi64_sub: zero-length array!"); for(i = 0; i < len; i++) { tmp = x[i] - bw; bw = (tmp > x[i]); //bw = ((uint64)tmp > (uint64)x[i]); - ASSERT(HERE, bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!"); + ASSERT(bw == ((uint64)tmp > (uint64)x[i]), "mi64_sub: compiler using signed compare (tmp > x[i])!"); /* Need an extra temp here due to asymmetry of subtract: */ tmp2= tmp - y[i]; bw += (tmp2 > tmp); //bw += ((uint64)tmp2 > (uint64)tmp); - ASSERT(HERE, (tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!"); + ASSERT((tmp2 > tmp) == ((uint64)tmp2 > (uint64)tmp), "mi64_sub: compiler using signed compare (tmp2 > tmp)!"); z[i] = tmp2; } return bw; @@ -2565,7 +2565,7 @@ uint64 mi64_add_scalar(const uint64 x[], uint64 a, uint64 y[], uint32 len) { uint32 i; uint64 cy = a; - ASSERT(HERE, x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!"); + ASSERT(x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!"); if(x == y) { /* In-place: Only need to proceed until carry peters out: */ for(i = 0; i < len; i++) { @@ -2595,7 +2595,7 @@ uint64 mi64_sub_scalar(const uint64 x[], uint64 a, uint64 y[], uint32 len) { uint32 i; uint64 bw = a, tmp; - ASSERT(HERE, x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!"); + ASSERT(x != 0x0 && y != 0x0 && len != 0, "mi64_add_scalar: null-pointer or zero-length array!"); if(x == y) { /* In-place: Only need to proceed until borrow peters out: */ for(i = 0; i < len; i++) { @@ -2672,7 +2672,7 @@ uint64 mi64_mul_scalar(const uint64 x[], uint64 a, uint64 y[], uint32 len) cy = hi + (y[i++] < lo); } // Cleanup loop for remaining terms: - ASSERT(HERE, len != 0, "zero-length array!"); + ASSERT(len != 0, "zero-length array!"); for(; i < len; i++) { #ifdef MUL_LOHI64_SUBROUTINE 
@@ -2712,12 +2712,12 @@ __device__ uint64 mi64_mul_scalar_add_vec2(const uint64 x[], uint64 a, const uint64 y[], uint64 z[], uint32 len) { uint64 cy; // Jul 2016: Same GCC bug as detailed in mi64_add - cy = 0ull; ASSERT(HERE, cy == 0, "Init (cy = 0) fails!"); + cy = 0ull; ASSERT(cy == 0, "Init (cy = 0) fails!"); #if MI64_MSAV2 uint64 *u = 0x0, *v = 0x0; uint64 c2; u = (uint64 *)calloc(len, sizeof(uint64)); v = (uint64 *)calloc(len, sizeof(uint64)); - ASSERT(HERE, u != 0x0 && v != 0x0, "calloc failed!"); + ASSERT(u != 0x0 && v != 0x0, "calloc failed!"); memcpy(v,y,(len<<3)); // Save copy of x[] c2 = mi64_mul_scalar(x, a, u, len); c2 += mi64_add(u, y, u, len); @@ -2751,7 +2751,7 @@ uint64 mi64_mul_scalar_add_vec2(const uint64 x[], uint64 a, const uint64 y[], ui cy += (z[i] < tmp); } // Cleanup loop for remaining terms: - ASSERT(HERE, len != 0, "zero-length array!"); + ASSERT(len != 0, "zero-length array!"); for(; i < len; i++) { #ifdef MUL_LOHI64_SUBROUTINE @@ -2850,10 +2850,10 @@ uint64 mi64_mul_scalar_add_vec2(const uint64 x[], uint64 a, const uint64 y[], ui if(!mi64_cmp_eq(u,z,len) || (cy != c2)) { for(i = 0; i < len; i++) { // if(u[i] != z[i]) - printf("i = %u Error: U = %20llu, Z = %20llu, Diff = %20lld\n",i,u[i],z[i],(int64)(u[i]-z[i]) ); + printf("i = %u Error: U = %20" PRIu64 ", Z = %20" PRIu64 ", Diff = %20" PRId64 "\n",i,u[i],z[i],(int64)(u[i]-z[i]) ); } - if(cy != c2) printf("Carry Error: c2 = %20llu, cy = %20llu, Diff = %20lld\n",c2,cy,(int64)(c2-cy) ); - ASSERT(HERE, 0, "mi64_add ASM result incorrect!"); + if(cy != c2) printf("Carry Error: c2 = %20" PRIu64 ", cy = %20" PRIu64 ", Diff = %20" PRId64 "\n",c2,cy,(int64)(c2-cy) ); + ASSERT(0, "mi64_add ASM result incorrect!"); } free((void *)u); u = 0x0; free((void *)v); v = 0x0; @@ -2897,12 +2897,12 @@ void mi64_mul_vector(const uint64 x[], uint32 lenX, const uint64 y[], uint32 len static uint64 *u = 0x0; static uint32 dimU = 0; #endif - ASSERT(HERE, x && y && z, "Null array x/y/z!"); - ASSERT(HERE, lenX != 
0, "zero-length X-array!"); - ASSERT(HERE, lenY != 0, "zero-length Y-array!"); - ASSERT(HERE, x != z, "X and Z point to same array object!"); - ASSERT(HERE, y != z, "Y and Z point to same array object!"); - ASSERT(HERE, lenZ != 0x0, "Null lenZ pointer!"); + ASSERT(x && y && z, "Null array x/y/z!"); + ASSERT(lenX != 0, "zero-length X-array!"); + ASSERT(lenY != 0, "zero-length Y-array!"); + ASSERT(x != z, "X and Z point to same array object!"); + ASSERT(y != z, "Y and Z point to same array object!"); + ASSERT(lenZ != 0x0, "Null lenZ pointer!"); /* Init z[] = 0: */ for(i = 0; i < lenX + lenY; i++) { z[i] = 0; } @@ -2935,7 +2935,7 @@ void mi64_mul_vector(const uint64 x[], uint32 lenX, const uint64 y[], uint32 len if(dimU < 2*(lenA+1)) { // GG: fixed bug in comparison dimU = 2*(lenA+1); // Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally - u = (uint64 *)realloc(u, dimU*sizeof(uint64)); ASSERT(HERE, u != 0x0, "alloc failed!"); + u = (uint64 *)realloc(u, dimU*sizeof(uint64)); ASSERT(u != 0x0, "alloc failed!"); } #endif /* Loop over remaining (lenB-1) elements of B[], multiplying A by each, and @@ -2951,7 +2951,7 @@ void mi64_mul_vector(const uint64 x[], uint32 lenX, const uint64 y[], uint32 len more leading terms of the result is zero, caller can adjust vector length accordingly: */ *lenZ = mi64_getlen(z, *lenZ); - ASSERT(HERE, *lenZ <= lenA + lenB, "*lenZ > (lenA + lenB)!"); + ASSERT(*lenZ <= lenA + lenB, "*lenZ > (lenA + lenB)!"); } /* Squaring-specialized version of above. 
By way of example, consider a length-10 input vector and @@ -3054,11 +3054,11 @@ void mi64_sqr_vector(const uint64 x[], uint64 z[], uint32 len) if(dbg) printf("realloc to dimU = %u\n",dimU); #endif // Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally - u = (uint64 *)realloc(u, 4* len *sizeof(uint64)); ASSERT(HERE, u != 0x0, "alloc failed!"); + u = (uint64 *)realloc(u, 4* len *sizeof(uint64)); ASSERT(u != 0x0, "alloc failed!"); } #endif - ASSERT(HERE, z != x, "Input and output arrays must be distinct!"); - ASSERT(HERE, len != 0, "zero-length X-array!"); + ASSERT(z != x, "Input and output arrays must be distinct!"); + ASSERT(len != 0, "zero-length X-array!"); memset(z, 0ull,(len8<<1)); // Clear z[0,...,2*len-1] @@ -3067,7 +3067,7 @@ void mi64_sqr_vector(const uint64 x[], uint64 z[], uint32 len) z[len] = mi64_mul_scalar(x+1, x[0], z+1, len-1); #if MI64_SQR_DBG if(dbg) { - printf("x0*x[1...n-1] = %llu * %s...\n",x[0],&cbuf[convert_mi64_base10_char(cbuf,x+1,len-1,0)]); + printf("x0*x[1...n-1] = %" PRIu64 " * %s...\n",x[0],&cbuf[convert_mi64_base10_char(cbuf,x+1,len-1,0)]); printf(" ... -> z = %s...\n",&cbuf[convert_mi64_base10_char(cbuf,z,2*len,0)]); } #endif @@ -3077,7 +3077,7 @@ void mi64_sqr_vector(const uint64 x[], uint64 z[], uint32 len) z[len+j] = mi64_mul_scalar_add_vec2(x+i, x[j], z+i+j, z+i+j, len-i); #if MI64_SQR_DBG if(dbg) { - printf("x%u*x[%u...n-1] = %llu * %s...\n",j,i,x[j],&cbuf[convert_mi64_base10_char(cbuf,x+i,len-i,0)]); + printf("x%u*x[%u...n-1] = %" PRIu64 " * %s...\n",j,i,x[j],&cbuf[convert_mi64_base10_char(cbuf,x+i,len-i,0)]); printf(" ... 
+= z = %s...\n",&cbuf[convert_mi64_base10_char(cbuf,z,2*len,0)]); } #endif @@ -3151,14 +3151,14 @@ void mi64_mul_vector_lo_half (const uint64 x[], const uint64 y[], uint64 z[], ui /* Scratch array for storing intermediate scalar*vector products: */ static uint64 *u = 0x0; static uint32 dimU = 0; - ASSERT(HERE, x && y && z, "Null array pointer!"); - ASSERT(HERE, len != 0, "zero-length X-array!"); + ASSERT(x && y && z, "Null array pointer!"); + ASSERT(len != 0, "zero-length X-array!"); // Does scratch array need allocating or reallocating? (Use realloc for both cases): if(dimU < 2*(len+1)) { // GG: fixed bug in comparison dimU = 2*(len+1); // Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally u = (uint64 *)realloc(u, 2*(len+1)*sizeof(uint64)); // NB: realloc leaves newly-alloc'ed size fraction uninited - ASSERT(HERE, u != 0x0, "alloc failed!"); + ASSERT(u != 0x0, "alloc failed!"); } memset(u, 0ull, (len<<4)); // Accumulator u[] needs to be cleared each time #endif @@ -3208,11 +3208,11 @@ void mi64_mul_vector_hi_half (const uint64 x[], const uint64 y[], uint64 z[], ui // Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally u = (uint64 *)realloc(u, 2*(len+1)*sizeof(uint64)); v = (uint64 *)realloc(v, 4* len *sizeof(uint64)); - ASSERT(HERE, u != 0x0 && v != 0x0, "alloc failed!"); + ASSERT(u != 0x0 && v != 0x0, "alloc failed!"); } memset(v, 0ull, (len<<4)); // Accumulator v[] needs to be cleared each time #endif - ASSERT(HERE, len != 0, "zero-length X-array!"); + ASSERT(len != 0, "zero-length X-array!"); /* Loop over the elements of y[], multiplying x[] by each, and using u[] as a scratch array to store x[]*y[j] prior to adding to z[]. 
@@ -3230,7 +3230,7 @@ void mi64_mul_vector_hi_half (const uint64 x[], const uint64 y[], uint64 z[], ui continue; u[len] = mi64_mul_scalar(x, y[j], u, len); #if MI64_MULHI_DBG - if(dbg) { printf("mi64_mul_vector_hi_half: j = %d, cy = %20llu, U = %s\n",j,u[len], &cbuf[convert_mi64_base10_char(cbuf, u, len+1, 0)]); } + if(dbg) { printf("mi64_mul_vector_hi_half: j = %d, cy = %20" PRIu64 ", U = %s\n",j,u[len], &cbuf[convert_mi64_base10_char(cbuf, u, len+1, 0)]); } #endif /* Add j-word-left-shifted u[] to v[]: */ /*** 11/2013: Simply could not get this to work using any opt-level > 0 under debian/gcc4.6 ***/ @@ -3240,7 +3240,7 @@ void mi64_mul_vector_hi_half (const uint64 x[], const uint64 y[], uint64 z[], ui if(dbg) { printf("mi64_mul_vector_hi_half: j = %d, V = %s\n",j, &cbuf[convert_mi64_base10_char(cbuf, v, len+j+1, 0)]); } if(dbg) { for(i=0;i<=len;++i) { - printf("v[%2d] = %20llu\n",i+j,v[i+j]); + printf("v[%2d] = %20" PRIu64 "\n",i+j,v[i+j]); } } #endif @@ -3350,14 +3350,14 @@ void mi64_mul_vector_hi_trunc(const uint64 x[], const uint64 y[], uint64 z[], ui uint64 tprod[2], cy; static uint64 *u = 0x0, *v = 0x0; // Scratch arrays for storing intermediate scalar*vector products static uint32 dimU = 0; - ASSERT(HERE, len != 0, "zero-length X-array!"); + ASSERT(len != 0, "zero-length X-array!"); // Does scratch array need allocating or reallocating? (Use realloc for both cases): if(dimU < 2*(len+1)) { // GG: fixed bug in comparison dimU = 2*(len+1); // Alloc 2x the immediately-needed to avoid excessive reallocs if neededsize increases incrementally u = (uint64 *)realloc(u, (len+1)<<4); // Realloc with 2*(len+1)*sizeof(uint64) bytes v = (uint64 *)realloc(v, len <<5); // Realloc with 4*(len )*sizeof(uint64) bytes - ASSERT(HERE, u != 0x0 && v != 0x0, "alloc failed!"); + ASSERT(u != 0x0 && v != 0x0, "alloc failed!"); } /* Compute desired row-sums by row index. 
For row j (j renamed 'idx' in function below): @@ -3401,7 +3401,7 @@ void mi64_mul_vector_hi_trunc(const uint64 x[], const uint64 y[], uint64 z[], ui // Test code for fast version of this function - re-use low half of v[] for output:: mi64_mul_vector_hi_half(x,y,v,len); if(!mi64_cmp_eq(v,v+len,len)) { - ASSERT(HERE,0,"mi64_mul_vector_hi_trunc result incorrect!"); + ASSERT(0,"mi64_mul_vector_hi_trunc result incorrect!"); } #endif /* Copy v[len:2*len-1] into z[0:len-1]: */ @@ -3458,23 +3458,23 @@ void mi64_mul_vector_hi_qmmp(const uint64 y[], const uint64 p, const uint64 k, u } u = (uint64 *)calloc(ldim, sizeof(uint64)); v = (uint64 *)calloc(ldim, sizeof(uint64)); - ASSERT(HERE, u != 0x0 && v != 0x0, "alloc failed!"); + ASSERT(u != 0x0 && v != 0x0, "alloc failed!"); } #endif //====need to finish 200-bit support! ======================= - ASSERT(HERE, z != y, "Input and output arrays must be distinct!"); - ASSERT(HERE, p < bits, "shift parameters out of range!"); - ASSERT(HERE, len != 0, "zero-length X-array!"); + ASSERT(z != y, "Input and output arrays must be distinct!"); + ASSERT(p < bits, "shift parameters out of range!"); + ASSERT(len != 0, "zero-length X-array!"); for(i = len+1; i < len2; i++) { u[i] = 0ull; // With proper padding of U don't need any zeroing of V prior to V = (U << p) step below } // memset(v, 0ull, (len<<4)); // No need to clear Accumulator v[] here due to dim = len2 in mi64_shl below - ASSERT(HERE, (k != 0) && ((k2>>1) == k), "2*k overflows!"); // Make sure 2*k did not overflow + ASSERT((k != 0) && ((k2>>1) == k), "2*k overflows!"); // Make sure 2*k did not overflow u[len] = mi64_mul_scalar(y,k2,u,len); // u[] stores Z = 2.k.Y mi64_shl(u,v,p,len2); // v[] stores (Z << p), store result in V u[len] -= mi64_sub(u,y,u,len); // (2k-1).Y = Z-Y, store result in U bw = mi64_sub(v,u,v,len+1); - ASSERT(HERE, !bw, "Unexpected borrow!"); + ASSERT(!bw, "Unexpected borrow!"); /* Right-shift by B bits to get UMULH(q,Y) = ((Z << p) - (2k-1).Y) >> B: */ 
mi64_shrl(v,v,bits,len2,len2); @@ -3491,13 +3491,13 @@ void mi64_mul_vector_hi_qmmp(const uint64 y[], const uint64 p, const uint64 k, u u[0] = 1; mi64_shl(u, u, p, len); // 2^p mi64_sub_scalar(u, 1, u, len); // M(p) = 2^p-1 - ASSERT(HERE, 0 == mi64_mul_scalar(u, k2, u, len), "2.k.M(p) overflows!"); // 2.k.M(p) + ASSERT(0 == mi64_mul_scalar(u, k2, u, len), "2.k.M(p) overflows!"); // 2.k.M(p) mi64_add_scalar(u, 1ull, u, len); // q = 2.k.M(p) + 1 // Test code for fast version of this function - re-use v[] for output:: // mi64_mul_vector_hi_half(u,y,v,len); mi64_mul_vector_hi_fast(y,p,k,v,len); if(!mi64_cmp_eq(v,z,len)) { - ASSERT(HERE, 0, "mi64_mul_vector_hi_qmmp/fast results differ!"); + ASSERT(0, "mi64_mul_vector_hi_qmmp/fast results differ!"); } #endif } @@ -3569,22 +3569,22 @@ void mi64_mul_vector_hi_fast(const uint64 y[], const uint64 p, const uint64 k, u uint32 i, bits; uint64 k2m1 = k-1+k, tmp,bw0,bw1,bw,cw,cy,cz; uint64 *zptr; - ASSERT(HERE, z != y, "Input and output arrays must be distinct!"); - ASSERT(HERE, (k != 0) && ((k2m1>>1) == k-1), "2*k-1 overflows!"); - ASSERT(HERE, len != 0, "zero-length X-array!"); + ASSERT(z != y, "Input and output arrays must be distinct!"); + ASSERT((k != 0) && ((k2m1>>1) == k-1), "2*k-1 overflows!"); + ASSERT(len != 0, "zero-length X-array!"); // 1. compute z' = (2k-1).y via vector-scalar mul, the carryout word cw = ((2k-1).Y >> B); cw = mi64_mul_scalar(y,k2m1,z,len); // z' = (2k-1).y bw0 = z[len-1]; -//if(k==900) printf("Mi64: bw0 = %20llu, cw = %20llu, z` = %s\n", bw0,cw,&s0[convert_mi64_base10_char(s0, z, len, 0)]); +//if(k==900) printf("Mi64: bw0 = %20" PRIu64 ", cw = %20" PRIu64 ", z` = %s\n", bw0,cw,&s0[convert_mi64_base10_char(s0, z, len, 0)]); // 2. 
compute low n words of z = z' + y via vector-vector add, any carryout of that gets added to a 2nd copy of cw, cz; cz = cw + mi64_add(y,z,z, len); // z = z' + y -//if(k==900) printf("Mi64: cz = %20llu, z = %s\n", cz,&s0[convert_mi64_base10_char(s0, z, len, 0)]); +//if(k==900) printf("Mi64: cz = %20" PRIu64 ", z = %s\n", cz,&s0[convert_mi64_base10_char(s0, z, len, 0)]); // 3. compute low n words of z >> (b-p), then separately shift in cz from the left, via (2^b*cz) >> (b-p) = (cz << p). - ASSERT(HERE, (len<<6) > p, "shift parameters out of range!"); + ASSERT((len<<6) > p, "shift parameters out of range!"); bw1 = mi64_shrl(z,z,(len<<6)-p,len,len); // low n words of z >> (b-p); high 64 bits of off-shifted portion saved in bw1 -//if(k==900) printf("Mi64: bw1 = %20llu, z>> = %s\n", bw1,&s0[convert_mi64_base10_char(s0, z, len, 0)]); +//if(k==900) printf("Mi64: bw1 = %20" PRIu64 ", z>> = %s\n", bw1,&s0[convert_mi64_base10_char(s0, z, len, 0)]); /* Check for borrow-on-subtract of to-be-off-shifted sections: have a borrow if z' (result from above mul_scalar, not including the carryout word cw) > ((z << p) % 2^b) (off-shifted portion of z = z' + y above, left-justified to fill a b-bit field) @@ -3600,18 +3600,18 @@ and tell the user to call the slow exact version of this function, currently ina zptr = z+i; // If (b-p) == 0 (mod 64) all of cz goes into z[i], with i = (b-p)/64; if(bits == 0) { - ASSERT(HERE, 0 == mi64_add_scalar(zptr,cz,zptr,len-i), "unexpected carryout of ( + cw)!"); + ASSERT(0 == mi64_add_scalar(zptr,cz,zptr,len-i), "unexpected carryout of ( + cw)!"); // Otherwise cz gets split between z[i] and z[i+1]: } else { // low 64-(p%64) bits of cz = (cz << bits) go into z[i]: tmp = (cz << bits); *zptr += tmp; cy = (*zptr++ < tmp); // high (p%64) bits of cw = (cw >> bits) go into z[i+1] - ASSERT(HERE, 0 == mi64_add_scalar(zptr,(cz >> (64-bits)) + cy,zptr,len-i-1), "unexpected carryout of ( + cw).hi!"); + ASSERT(0 == mi64_add_scalar(zptr,(cz >> (64-bits)) + 
cy,zptr,len-i-1), "unexpected carryout of ( + cw).hi!"); } // 4. subtract scalar (bw + cw) from resulting vector to effect ... - (2k-1).Y step in [*]. - ASSERT(HERE, 0 == mi64_sub_scalar(z,(bw + cw),z,len), "unexpected carryout of (... - cw) !"); + ASSERT(0 == mi64_sub_scalar(z,(bw + cw),z,len), "unexpected carryout of (... - cw) !"); } @@ -3658,12 +3658,12 @@ void mi64_mul_vector_hi_qferm(const uint64 y[], const uint64 p, const uint64 k, free((void *)u); u = 0x0; } u = (uint64 *)calloc(ldim, sizeof(uint64)); - ASSERT(HERE, u != 0x0, "alloc failed!"); + ASSERT(u != 0x0, "alloc failed!"); } #endif - ASSERT(HERE, z != y, "Input and output arrays must be distinct!"); - ASSERT(HERE, p < bits, "shift parameters out of range!"); - ASSERT(HERE, k != 0ull, "k must be nonzero!"); + ASSERT(z != y, "Input and output arrays must be distinct!"); + ASSERT(p < bits, "shift parameters out of range!"); + ASSERT(k != 0ull, "k must be nonzero!"); for(i = len+1; i < len2; i++) { u[i] = 0ull; // With proper padding of U don't need any zeroing of V prior to V = (U << p) step below } @@ -3672,7 +3672,7 @@ void mi64_mul_vector_hi_qferm(const uint64 y[], const uint64 p, const uint64 k, mi64_shl(u,u,(p+1),len2); // u[] stores (Z << p) cy = mi64_add(u,y,u,len); cy = mi64_add_scalar(u+len,cy,u+len, len2-len); - ASSERT(HERE, (cy == 0ull), "Unexpected carry!"); + ASSERT((cy == 0ull), "Unexpected carry!"); /* Right-shift by B bits to get UMULH(q,Y) = ((Z << p) - (2k-1).Y) >> B: */ mi64_shrl(u,u,bits,len2,len2); @@ -3705,14 +3705,14 @@ uint32 mi64_cvt_uint64_double(const uint64 x[], const uint64 y[], uint32 cy, uin int64 cyi, cyj, itmp, jtmp; uint64 curr_re64, curr_im64, bitsm1 = FFT_MUL_BITS-1, basem1 = FFT_MUL_BASE-1; - ASSERT(HERE, len != 0, "mi64_cvt_uint64_double: zero-length array!"); + ASSERT(len != 0, "mi64_cvt_uint64_double: zero-length array!"); /* Only constant base 2^16 is supported for this conversion at present: */ - ASSERT(HERE, FFT_MUL_BITS == 16, "mi64_cvt_uint64_double: 
FFT_MUL_BITS != 16"); + ASSERT(FFT_MUL_BITS == 16, "mi64_cvt_uint64_double: FFT_MUL_BITS != 16"); /* Redo the quicker checks of those done in util.c::check_nbits_in_types() */ - ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_uint64_double: FFT_MUL_BASE not pure-integer!"); - ASSERT(HERE, FFT_MUL_BASE < TWO54FLOAT, "mi64_cvt_uint64_double: FFT_MUL_BASE >= maximum allowed value of 2^54!"); + ASSERT(DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_uint64_double: FFT_MUL_BASE not pure-integer!"); + ASSERT(FFT_MUL_BASE < TWO54FLOAT, "mi64_cvt_uint64_double: FFT_MUL_BASE >= maximum allowed value of 2^54!"); /* As we extract each floating-point word, balance it and set resulting carry into next FP word: */ @@ -3740,16 +3740,16 @@ uint32 mi64_cvt_uint64_double(const uint64 x[], const uint64 y[], uint32 cy, uin a[jpad+1] = (double)(jtmp - (cyj<= 0!"); + ASSERT(a[jpad ] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!"); a[jpad ] += FFT_MUL_BASE; } if(cyj) { - ASSERT(HERE, a[jpad+1] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!"); + ASSERT(a[jpad+1] <= 0,"mi64_cvt_uint64_double: MS array element >= 0!"); a[jpad+1] += FFT_MUL_BASE; } printf("mi64_cvt_uint64_double: Final a[%u,%u] = %15.3f,%15.3f\n",jpad,jpad+1,a[jpad],a[jpad+1]); @@ -3790,18 +3790,18 @@ uint32 mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[] int64 cy_re, cy_im, itmp, jtmp; uint64 curr_re64, curr_im64; - ASSERT(HERE, n != 0, "zero-length array!"); + ASSERT(n != 0, "zero-length array!"); /* Redo the quicker checks of those done in util.c::check_nbits_in_types() */ - ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "FFT_MUL_BASE not pure-integer!"); - ASSERT(HERE, FFT_MUL_BASE < TWO54FLOAT, "FFT_MUL_BASE >= maximum allowed value of 2^54!"); + ASSERT(DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "FFT_MUL_BASE not pure-integer!"); + ASSERT(FFT_MUL_BASE < TWO54FLOAT, "FFT_MUL_BASE >= maximum allowed value of 2^54!"); /* Obsolete, for historical reference only: // Make sure MSW 
of Re(A[]) and Im(A[]) in the balanced-representation form are both >= 0: // Re(A[]) stored in even terms: for(i = 2*n-2; i >= 0; i-=2) { j = i + ( (i >> DAT_BITS) << PAD_BITS ); if(a[j] != 0.0) { - ASSERT(HERE, a[j] > 0.0, "MSW(Re(A[])) < 0!"); + ASSERT(a[j] > 0.0, "MSW(Re(A[])) < 0!"); break; } } @@ -3809,7 +3809,7 @@ uint32 mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[] for(i = 2*n-1; i >= 1; i-=2) { j = i + ( (i >> DAT_BITS) << PAD_BITS ); if(a[j] != 0.0) { - ASSERT(HERE, a[j] > 0.0, "MSW(Im(A[])) < 0!"); + ASSERT(a[j] > 0.0, "MSW(Im(A[])) < 0!"); break; } } @@ -3827,9 +3827,9 @@ uint32 mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[] j = i + ( (i >> DAT_BITS) << PAD_BITS ); - itmp = (uint64)1<= 0 && jtmp >= 0,"itmp,jtmp must be nonnegative 0!"); - ASSERT(HERE, (curr_re64>>curr_bits) == 0 && (curr_im64>>curr_bits) == 0,"(curr_wd64>>curr_bits) != 0!"); + ASSERT(itmp >= 0 && jtmp >= 0,"itmp,jtmp must be nonnegative 0!"); + ASSERT((curr_re64>>curr_bits) == 0 && (curr_im64>>curr_bits) == 0,"(curr_wd64>>curr_bits) != 0!"); /* Copy bits of the current residue word into the accumulator, starting at the (curr_bits)th bit. 
The resulting total number of accumulated bits @@ -3884,10 +3884,10 @@ uint32 mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[] y[len++] = curr_im64; nbits += curr_bits; } -// printf("mi64_cvt_double_uint64: Final a[%u,%u] = %15.3f,%15.3f; x,y[%u] = %llu,%llu\n",j,j+1,a[j],a[j+1],len-1,x[len-1],y[len-1]); - ASSERT(HERE, nbits == n*FFT_MUL_BITS,"nbits == n*FFT_MUL_BASE!"); - ASSERT(HERE, len == (n>>2) ,"len should == n/4!"); - ASSERT(HERE, ABS(cy_re) <= 1 && ABS(cy_im) <= 1,"Output carry out of range!"); +// printf("mi64_cvt_double_uint64: Final a[%u,%u] = %15.3f,%15.3f; x,y[%u] = %" PRIu64 ",%" PRIu64 "\n",j,j+1,a[j],a[j+1],len-1,x[len-1],y[len-1]); + ASSERT(nbits == n*FFT_MUL_BITS,"nbits == n*FFT_MUL_BASE!"); + ASSERT(len == (n>>2) ,"len should == n/4!"); + ASSERT(ABS(cy_re) <= 1 && ABS(cy_im) <= 1,"Output carry out of range!"); // Carries declared signed, but throw in casts of the 0 in the < compares to ensure signedness of these: return ( (cy_im < (int64)0)*8 + (cy_im != 0ull)*4 + (cy_re < (int64)0)*2 + (cy_re != 0ull) ); } @@ -3898,15 +3898,15 @@ uint32 mi64_cvt_double_uint64(const double a[], uint32 n, uint64 x[], uint64 y[] uint32 mi64_init_mers_or_ferm_modulus(uint64 exp, int modtype, uint64 mvec[]) { uint32 i,j; // j = uint64 vector length - ASSERT(HERE, mvec != 0x0, "Null output-vector pointer!"); + ASSERT(mvec != 0x0, "Null output-vector pointer!"); if(modtype == 0) { // Mersenne, 2^exp - 1 - ASSERT(HERE, isPRP64(exp), "Mersenne exponent must be prime!"); + ASSERT(isPRP64(exp), "Mersenne exponent must be prime!"); j = (exp+63)>>6; // Loop rather than call to mi64_set_eq_scalar here, since need to set all elts = -1: for(i = 0; i < j; i++) { mvec[i] = -1ull; } mvec[j-1] >>= 64-(exp&63); // Leading word needs >> to leave just low exp%64 bits set } else { // Fermat, 2^exp + 1 - ASSERT(HERE, exp < 64, "Max supported Fermat-number index = 63!"); + ASSERT(exp < 64, "Max supported Fermat-number index = 63!"); j = ((1ull << exp)+63)>>6; // 
j = uint64 vector length; init sans the leading '1' word, then increment prior to mi64_div mi64_clear(mvec,j); @@ -3928,7 +3928,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) { const uint32 max_dim = 4096; uint64 n[max_dim],result[max_dim]; - ASSERT(HERE, len <= max_dim, "mi64_pprimeF: Required array length exceeds dimensioned maximum!"); + ASSERT(len <= max_dim, "mi64_pprimeF: Required array length exceeds dimensioned maximum!"); mi64_set_eq(n, p, len); mi64_sub_scalar(n, 1ull, n, len); /* n = p - 1 */ mi64_scalar_modpow_lr(z, n, p, len, result); return mi64_cmp_eq_scalar(result,1ull, len); @@ -3962,7 +3962,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) #endif #if MI64_PRP_DBG if(dbg) { - printf("mi64_scalar_modpow_lr: %llu^%s (mod q = %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,q,len,0)]); + printf("mi64_scalar_modpow_lr: %" PRIu64 "^%s (mod q = %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,q,len,0)]); printf("Using Montgomery-multiply remaindering.\n"); } #endif @@ -3970,13 +3970,13 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) mi64_clear(c,len); c[0] = a; return; } - ASSERT(HERE, b != 0x0 && c != 0x0, "Null input- or output-array pointer!"); + ASSERT(b != 0x0 && c != 0x0, "Null input- or output-array pointer!"); // Working length = length of product of scalar powering-base and modulus vector; // must not assume [len] reflects number nonzero limbs, i.e. 
thee might be 0-pads at high end: - wlen = mi64_getlen(q, len); ASSERT(HERE, wlen > 0, "0-length modulus!"); + wlen = mi64_getlen(q, len); ASSERT(wlen > 0, "0-length modulus!"); // Increment working length if a*q overflows into the next-higher limb: i64 = mi64_mul_scalar(q,a,prod,wlen); wlen += (i64 != 0ull); - ASSERT(HERE, wlen <= max_dim, "mi64_modpow_lr: Required array length exceeds dimensioned maximum!"); + ASSERT(wlen <= max_dim, "mi64_modpow_lr: Required array length exceeds dimensioned maximum!"); // Init writable local array n[] = q[], including 0-pad at top if a*q overflows len limbs mi64_set_eq(n, q, len); if(i64) n[wlen-1] = 0ull; // Use carryo4t i64 rather than (wlen > len) here, since wlen may be < len wlen2 = wlen + wlen; @@ -3985,8 +3985,8 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) if(dbg) printf("Modulus q has %u limbs, a*q has %u limbs\n",mi64_getlen(q,len),wlen); #endif nbits = wlen << 6; log2_numbits = ceil(log(1.0*nbits)/log(2.0)); - ASSERT(HERE, IS_ODD(n[0]), "Modulus must be odd for Montgomery-mod-based LR binary powering!"); - if(len == 1) ASSERT(HERE, a < n[0], "Input base array must be properly normalized (mod q)!"); + ASSERT(IS_ODD(n[0]), "Modulus must be odd for Montgomery-mod-based LR binary powering!"); + if(len == 1) ASSERT(a < n[0], "Input base array must be properly normalized (mod q)!"); /* Find modular inverse (mod 2^nbits) of w in preparation for modular multiply. w must be odd for Montgomery-style modmul to work. 
@@ -4015,7 +4015,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) } // Check the computed inverse: mi64_mul_vector_lo_half(n, ninv, prod, wlen); - ASSERT(HERE, mi64_cmp_eq_scalar(prod, 1ull, wlen), "Bad Montmul inverse!"); + ASSERT(mi64_cmp_eq_scalar(prod, 1ull, wlen), "Bad Montmul inverse!"); #if MI64_PRP_DBG if(dbg) printf("qinv = %s\n", &cbuf[convert_mi64_base10_char(cbuf, ninv, wlen, 0)]); #endif @@ -4029,7 +4029,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) j = mi64_leadz(b,len); start_index = (len<<6) - j; #if MI64_PRP_DBG if(dbg) { - printf("base a[] = %llu, start_bit = %d\n",a,start_index-2); + printf("base a[] = %" PRIu64 ", start_bit = %d\n",a,start_index-2); printf("R*a (mod q) = %s\n",&cbuf[convert_mi64_base10_char(cbuf,c,wlen, 0)]); } #endif @@ -4046,9 +4046,9 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) // do bit-dependent mul-by-base here on the double-wide squaring output: #if !DO_N_MODSQUARES if(mi64_test_bit(b,j)) { - i64 = mi64_mul_scalar(prod, a, prod, wlen2); ASSERT(HERE, i64 == 0ull, "Unexpected carry out of a*x^2!"); + i64 = mi64_mul_scalar(prod, a, prod, wlen2); ASSERT(i64 == 0ull, "Unexpected carry out of a*x^2!"); #if MI64_PRP_DBG - if(dbg) printf("*= %llu = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]); + if(dbg) printf("*= %" PRIu64 " = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]); #endif } #endif // endif !DO_N_MODSQUARES @@ -4060,21 +4060,21 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) #endif // If hi < lo, then calculate (hi-lo)+q = q-lo+hi < q; otherwise calculate hi-lo: if(mi64_cmpult(hi,lo,wlen)) { - i64 = mi64_sub(hi,lo,lo,wlen); ASSERT(HERE, i64, "Expected a borrow!"); - i64 = mi64_add(n ,lo,c ,wlen); ASSERT(HERE, i64, "Expected borrow/carry cancellation!"); + i64 = mi64_sub(hi,lo,lo,wlen); ASSERT(i64, "Expected a borrow!"); + i64 = mi64_add(n ,lo,c ,wlen); ASSERT(i64, "Expected borrow/carry cancellation!"); } else { - 
i64 = mi64_sub(hi,lo,c ,wlen); ASSERT(HERE,!i64, "Unexpected borrow!"); + i64 = mi64_sub(hi,lo,c ,wlen); ASSERT(!i64, "Unexpected borrow!"); } #if MI64_PRP_DBG if(dbg) printf("(mod q) = %s\n", &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)]); - if(dbg && !(j & 1023)) printf("At bit %d: Res64 = %016llX\n",j,c[0]); + if(dbg && !(j & 1023)) printf("At bit %d: Res64 = %016" PRIX64 "\n",j,c[0]); #endif } // Do a final Montmul-by-1 to remove the excess *R (mod q); hi = 0 here simplifies things: mi64_mul_vector_lo_half( c,ninv,lo,wlen); mi64_mul_vector_hi_half(lo,n ,lo,wlen); // (hi-lo)+q = q-lo+hi = q-lo: - i64 = mi64_sub(n,lo, c,len); ASSERT(HERE,!i64, "Unxpected borrow!"); + i64 = mi64_sub(n,lo, c,len); ASSERT(!i64, "Unxpected borrow!"); #if MI64_PRP_DBG if(dbg) printf("retval = %s\n", &cbuf[convert_mi64_base10_char(cbuf, c, len, 0)]); #endif @@ -4098,25 +4098,25 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) #endif #if MI64_PRP_DBG if(dbg) { - printf("mi64_scalar_modpow_lr: %llu^%s (mod %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,b,len,0)]); + printf("mi64_scalar_modpow_lr: %" PRIu64 "^%s (mod %s)\n",a,&cbuf[convert_mi64_base10_char(cbuf,b,len,0)],&cstr[convert_mi64_base10_char(cstr,b,len,0)]); } #endif if(!a) { // a = 0; set result c[] = 0 and return mi64_clear(c,len); return; } - ASSERT(HERE, b != 0x0 && c != 0x0, "Null input- or output-array pointer!"); - ASSERT(HERE, len <= 1024, "mi64_modpow_lr: Max 1024 words allowed at present!"); + ASSERT(b != 0x0 && c != 0x0, "Null input- or output-array pointer!"); + ASSERT(len <= 1024, "mi64_modpow_lr: Max 1024 words allowed at present!"); mi64_set_eq_scalar(c, a, len); // Init result-holding array c[0] = a mi64_set_eq(npad , n, len); mi64_clear(npad+len, len); // set npad = a[]; npad is zero-padded // Working length = length of actual modulus vector: - wlen = mi64_getlen(n, len); ASSERT(HERE, wlen > 0, "0-length array!"); - ASSERT(HERE, mi64_cmpult(c, n, 
len), "Input base array must be properly normalized (mod n)!"); + wlen = mi64_getlen(n, len); ASSERT(wlen > 0, "0-length array!"); + ASSERT(mi64_cmpult(c, n, len), "Input base array must be properly normalized (mod n)!"); // LR modpow: j = leadz64(b[wlen-1]); start_index = (wlen<<6) - j; #if MI64_PRP_DBG if(dbg) { - printf("base a[] = %llu, start_bit = %d\n",a,start_index-1); + printf("base a[] = %" PRIu64 ", start_bit = %d\n",a,start_index-1); printf("x0 = %s, len = %u\n", &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)], wlen); } #endif @@ -4129,7 +4129,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) mi64_sqr_vector(c, prod, wlen); /* x^2 */ // mi64_div(prod, npad, len2, len2, 0x0, prod); *** Fails on F28 cofactor-PRP 3^nsquares (mod q) check; for x = 146715292687661855688^2 % q get 314605340220462438224, should = 240587464360836147143! *** mi64_div_binary(prod, npad, len2, len2, 0x0,&lenq, prod); - ASSERT(HERE, mi64_getlen(prod, len2) <= len, "mi64_modpow_lr: (x^2)%p illegal length"); + ASSERT(mi64_getlen(prod, len2) <= len, "mi64_modpow_lr: (x^2)%p illegal length"); mi64_set_eq(c, prod, len); /* c = (c^2)%p */ #if MI64_PRP_DBG if(dbg) printf("j = %d: x^2 (mod n) = %s\n", j, &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)]); @@ -4138,7 +4138,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) if(mi64_test_bit(b,j)) { prod[wlen] = mi64_mul_scalar(c, a, prod, wlen); #if MI64_PRP_DBG - if(dbg) printf("*= %llu = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]); + if(dbg) printf("*= %" PRIu64 " = %s\n", a, &cbuf[convert_mi64_base10_char(cbuf, prod, wlen+1, 0)]); #endif mi64_div_binary(prod, n, wlen+1, wlen, 0x0,0x0, c); // x = prod % p #if MI64_PRP_DBG @@ -4146,7 +4146,7 @@ uint32 mi64_pprimeF(const uint64 p[], uint64 z, uint32 len) #endif } #endif // endif !DO_N_MODSQUARES - if(!(j & 1023)) printf("At bit %d: Res64 = %016llX\n",j,c[0]); + if(!(j & 1023)) printf("At bit %d: Res64 = %016" PRIX64 "\n",j,c[0]); } #if 
MI64_PRP_DBG if(dbg) printf("retval = %s\n", &cbuf[convert_mi64_base10_char(cbuf, c, wlen, 0)]); @@ -4205,10 +4205,10 @@ void mi64_vcvtuqq2pd(const uint64 a[], double b[]) : "cc","memory","rax","rbx","rcx","xmm0","xmm1","xmm31" /* Clobbered registers */\ ); for(i = 0; i < 8; i++) { - ASSERT(HERE, b[i] == (double)a[i], "uint64->double conversion result differs from reference!"); + ASSERT(b[i] == (double)a[i], "uint64->double conversion result differs from reference!"); } #else - ASSERT(HERE, 0,"mi64_vcvtuqq2pd requires build with AVX512 instruction set!\n"); + ASSERT(0,"mi64_vcvtuqq2pd requires build with AVX512 instruction set!\n"); #endif // USE_AVX ? } @@ -4224,10 +4224,10 @@ void mi64_vcvtpd2uqq(const double a[], uint64 b[]) : "cc","memory","rax","rbx","rcx","xmm0","xmm1","xmm31" /* Clobbered registers */\ ); for(i = 0; i < 8; i++) { - ASSERT(HERE, (double)b[i] == a[i], "double->uint64 conversion result differs from reference!"); + ASSERT((double)b[i] == a[i], "double->uint64 conversion result differs from reference!"); } #else - ASSERT(HERE, 0,"mi64_vcvt2pduqq requires build with AVX512 instruction set!\n"); + ASSERT(0,"mi64_vcvt2pduqq requires build with AVX512 instruction set!\n"); #endif // USE_AVX ? } #endif // __CUDA_ARCH__ ? 
@@ -4686,14 +4686,14 @@ void mi64_modmul53_batch(const double a[], const double b[], const double m[], d : "cc","memory","rax","rbx","rcx","rdx" /* Clobbered registers */\ ); if(r[i] != rem64 && r[i] != im+rem64) { // Allow for rem to be either in [-m/2,+m/2] or in [0,m) - printf("[%2u/%2u]: %16llu * %16llu / %16llu = %16llu[quo], %16llu[rem], DP rem = %16.0f\n",i,ndata,ia,ib,im,quo64,rem64, r[i]); + printf("[%2u/%2u]: %16" PRIu64 " * %16" PRIu64 " / %16" PRIu64 " = %16" PRIu64 "[quo], %16" PRIu64 "[rem], DP rem = %16.0f\n",i,ndata,ia,ib,im,quo64,rem64, r[i]); if(++nerr > 1000) exit(0); } - // ASSERT(HERE, r[i] == rem64, "Modmul result differs from reference!"); + // ASSERT(r[i] == rem64, "Modmul result differs from reference!"); } #endif #else - ASSERT(HERE, 0,"mi64_modmul53_batch requires build with AVX2 instruction set!\n"); + ASSERT(0,"mi64_modmul53_batch requires build with AVX2 instruction set!\n"); #endif // USE_AVX ? } #endif // __CUDA_ARCH__ ? @@ -4717,7 +4717,7 @@ Test harness code: if(a > m) a %= m; if(b > m) b %= m; uint64 r = mi64_modmul64(a,b,m); - // printf("(%llu * %llu) mod %llu = %llu\n",a,b,m,r); + // printf("(%" PRIu64 " * %" PRIu64 ") mod %" PRIu64 " = %" PRIu64 "\n",a,b,m,r); } } */ @@ -4732,7 +4732,7 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m) if(first_entry) { unsigned short FPUCTRL; __asm__ volatile ("fstcw %0" : "=m" (FPUCTRL) ); - ASSERT(HERE, FPUCTRL == FPU_64CHOP, "This function requires user to set x87 FPU to truncatig-round mode!"); + ASSERT(FPUCTRL == FPU_64CHOP, "This function requires user to set x87 FPU to truncatig-round mode!"); first_entry = FALSE; } // x86_64 modmul code using 64-bit FDIV for quotient - 2 versions, first one for 63-bit inputs, needs ~36 cycles on Core2: @@ -4850,7 +4850,7 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m) // for which the true q = 13545154436197203258, but FDIV produces 13545154436197203256. 
// with this added check I got 10^11 sets of genuine 64-bit inputs to run sans errors: if(r >= m) { - // printf("a,b,m = %llu, %llu, %llu; FDIV-mod gives r = %llu\n",a,b,m,r); + // printf("a,b,m = %" PRIu64 ", %" PRIu64 ", %" PRIu64 "; FDIV-mod gives r = %" PRIu64 "\n",a,b,m,r); r -= m; } /* This was the code to normalize one of the inputs before I used the -= 2^63 trick, so you can see the usefulness of the latter: @@ -4872,15 +4872,15 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m) // only explicitly store i' = i%2^64, high bit handled implicitly: uint32 i; uint64 mulh,twoi,diff = -1ull,ip = -m; // Initial iterate = 2^65 - m = [1,-m] in base-2^64 twos-comp form. Init diff = UINT64_MAX for(i = 0; i < 10; i++) { - twoi = ip + ip; //ASSERT(HERE, twoi > ip , "Unexpected overflow in 2*ip computation!"); - mulh = twoi + __MULH64(ip,ip); ASSERT(HERE, mulh > twoi, "Unexpected overflow in mulh summation!"); + twoi = ip + ip; //ASSERT(twoi > ip , "Unexpected overflow in 2*ip computation!"); + mulh = twoi + __MULH64(ip,ip); ASSERT(mulh > twoi, "Unexpected overflow in mulh summation!"); mulh = __MULH64(m,mulh); #error*** Mar 2021: Hit above assert with inputs (a=2, b=2, m=5000099); commenting it out, the iteration fails to converge *** diff = ip - m - mulh; ip += diff; if(!diff) break; } - ASSERT(HERE, !diff, "Barrett-modmul scaled inverse computation failed to converge!"); + ASSERT(!diff, "Barrett-modmul scaled inverse computation failed to converge!"); uint64 lo,hi; #ifdef MUL_LOHI64_SUBROUTINE MUL_LOHI64(a,b,&lo,&hi); @@ -4958,7 +4958,7 @@ uint64 mi64_modmul64(const uint64 a, const uint64 b, const uint64 m) ,[__m] "g" (m) \ : "cc","memory","rax","rbx","rcx","rdx" /* Clobbered registers */\ ); - ASSERT(HERE, r == i64, "Modmul result differs from reference!"); + ASSERT(r == i64, "Modmul result differs from reference!"); #endif return r; @@ -4978,11 +4978,11 @@ int mi64_div(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, uint6 uint32 
xlen, ylen, max_len; uint64 itmp64; // Only the quotient array is optional: - ASSERT(HERE, lenX && lenY, "illegal 0 dimension!"); - ASSERT(HERE, x && y, "At least one of X, Y is null!"); - ASSERT(HERE, x != y, "X and Y arrays overlap!"); - ASSERT(HERE, r != y, "Y and Rem arrays overlap!"); - ASSERT(HERE, q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!"); + ASSERT(lenX && lenY, "illegal 0 dimension!"); + ASSERT(x && y, "At least one of X, Y is null!"); + ASSERT(x != y, "X and Y arrays overlap!"); + ASSERT(r != y, "Y and Rem arrays overlap!"); + ASSERT(q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!"); /* Init Q = 0; don't do similarly for R since we allow X and R to point to same array: */ if(q && (q != x)) { @@ -4991,7 +4991,7 @@ int mi64_div(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, uint6 /* And now find the actual lengths of the divide operands and use those for the computation: */ xlen = mi64_getlen(x, lenX); ylen = mi64_getlen(y, lenY); - ASSERT(HERE, ylen != 0, "divide by 0!"); + ASSERT(ylen != 0, "divide by 0!"); // If x < y, no modding needed - copy x into remainder and set quotient = 0: max_len = MAX(xlen, ylen); @@ -5048,12 +5048,12 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, static uint64 *scratch = 0x0; // "base pointer" for local storage shared by all of the above subarrays static uint64 *hi = 0x0, *v = 0x0, *w = 0x0; // These are treated as vars (cost-offsets of the above ptrs), // hence non-static. 
*** MUST RE-INIT ON EACH ENTRY *** - ASSERT(HERE, lenX && lenY, "illegal 0 dimension!"); - ASSERT(HERE, (lenY > 1) || (y[0] > 0), "Divide by zero!"); - ASSERT(HERE, (x && y) && (x != y), "Bad x or y array!"); - ASSERT(HERE, (q == 0x0 || q != r), "Quotient and remainder arrays must not overlap!"); // q may be 0x0, but must not overlap r + ASSERT(lenX && lenY, "illegal 0 dimension!"); + ASSERT((lenY > 1) || (y[0] > 0), "Divide by zero!"); + ASSERT((x && y) && (x != y), "Bad x or y array!"); + ASSERT((q == 0x0 || q != r), "Quotient and remainder arrays must not overlap!"); // q may be 0x0, but must not overlap r // To-do: Change from a simple pointers-coincide to an actual arrays-overlap check. - lenD = mi64_getlen(y, lenY); ASSERT(HERE, lenD != 0, "0-length divisor!"); + lenD = mi64_getlen(y, lenY); ASSERT(lenD != 0, "0-length divisor!"); // Alloc of the repeated-div-associated statics handled separately from other local storage: if(modDdim < lenD) { @@ -5063,9 +5063,9 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, free((void *)mod_inv_save); mod_inv_save = 0x0; free((void *)basepow_save); basepow_save = 0x0; } - modulus_save = (uint64 *)calloc((modDdim), sizeof(uint64)); ASSERT(HERE, modulus_save != 0x0, "alloc fail!"); - mod_inv_save = (uint64 *)calloc((modDdim), sizeof(uint64)); ASSERT(HERE, mod_inv_save != 0x0, "alloc fail!"); - basepow_save = (uint64 *)calloc((modDdim), sizeof(uint64)); ASSERT(HERE, basepow_save != 0x0, "alloc fail!"); + modulus_save = (uint64 *)calloc((modDdim), sizeof(uint64)); ASSERT(modulus_save != 0x0, "alloc fail!"); + mod_inv_save = (uint64 *)calloc((modDdim), sizeof(uint64)); ASSERT(mod_inv_save != 0x0, "alloc fail!"); + basepow_save = (uint64 *)calloc((modDdim), sizeof(uint64)); ASSERT(basepow_save != 0x0, "alloc fail!"); mod_repeat = FALSE; } @@ -5078,10 +5078,10 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, // printf("x = %s\n", 
&str_10k[__convert_mi64_base10_char(str_10k, 10<<10, x, lenX, 0)]); printf("y = %s\n", &s0[convert_mi64_base10_char(s0, y, lenD, 0)]); // Leave length-check off this so if y too large for print we assert right here // Compute result using slow binary-div algo, use that as reference: - qref = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(HERE, qref != 0x0, "alloc fail!"); - rref = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(HERE, rref != 0x0, "alloc fail!"); - lo_dbg = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(HERE, lo_dbg != 0x0, "alloc fail!"); - hi_dbg = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(HERE, hi_dbg != 0x0, "alloc fail!"); + qref = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(qref != 0x0, "alloc fail!"); + rref = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(rref != 0x0, "alloc fail!"); + lo_dbg = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(lo_dbg != 0x0, "alloc fail!"); + hi_dbg = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(hi_dbg != 0x0, "alloc fail!"); mi64_set_eq(lo_dbg,x,lenX); mi64_set_eq(hi_dbg,y,lenY); mi64_div_binary(lo_dbg,hi_dbg,lenX,lenY,qref,&lenQ,rref); @@ -5095,7 +5095,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, if(vsave) { free((void *)vsave); vsave = 0x0; } - vsave = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(HERE, vsave != 0x0, "alloc fail!"); + vsave = (uint64 *)calloc((lenX), sizeof(uint64)); ASSERT(vsave != 0x0, "alloc fail!"); } if(lenD > lens) { lens = lenD; @@ -5103,7 +5103,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, free((void *)scratch); scratch = yinv = cy = tmp = itmp = lo = hi = w = rem_save = 0x0; } /* (re)Allocate the needed auxiliary storage: */ - scratch = (uint64 *)calloc((lenD*8), sizeof(uint64)); ASSERT(HERE, scratch != 0x0, "alloc fail!"); + scratch = (uint64 *)calloc((lenD*8), sizeof(uint64)); ASSERT(scratch != 0x0, "alloc fail!"); } // These ptrs just point to various disjoint length-lenD 
sections of the shared local-storage chunk; // since some of them are treated as vars, reset 'em all on each entry, as well as re-zeroing the whole memblock: @@ -5157,8 +5157,8 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, nc++; mi64_sub(r,y,r,lenX); ++itmp64; // Need to incr quotient by 1 to account for extra sub-y } } - ASSERT(HERE, nc < ncmax, "Unexpectedly large number of corrections needed for floating-double quotient!"); - ASSERT(HERE, mi64_cmpult(r, y, lenX), "Remainder should be < modulus!"); + ASSERT(nc < ncmax, "Unexpectedly large number of corrections needed for floating-double quotient!"); + ASSERT(mi64_cmpult(r, y, lenX), "Remainder should be < modulus!"); // At this point are done with x, so set low word of quotient array and clear rest: if(q) { mi64_clear(q, lenX); q[0] = itmp64; @@ -5208,7 +5208,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, } hi = lo + lenS; // *** lo:hi pointer pairs must be offset by amount reflecting #words in right-justified modulus! *** #if MI64_DIV_MONT - if(dbg)printf("mi64_div_mont: setting hi = lo + lenS = %llX\n",(uint64)hi); + if(dbg)printf("mi64_div_mont: setting hi = lo + lenS = %" PRIX64 "\n",(uint64)hi); #endif // If single-word odd-component divisor, use specialized single-word-divisor version: @@ -5270,10 +5270,10 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, Init yinv = 3*w ^ 2. This formula returns the correct bottom 5 bits of yinv, and we double the number of correct bits on each of the subsequent iterations. 
*/ - ASSERT(HERE, (w[0] & (uint64)1) == 1, "modulus must be odd!"); + ASSERT((w[0] & (uint64)1) == 1, "modulus must be odd!"); ybits = lenS << 6; log2_numbits = ceil(log(1.0*ybits)/log(2.0)); - ASSERT(HERE, (w[0] & (uint64)1) == 1, "w must be odd!"); + ASSERT((w[0] & (uint64)1) == 1, "w must be odd!"); mi64_clear(yinv, lenS); yinv[0] = (w[0] + w[0] + w[0]) ^ (uint64)2; @@ -5301,7 +5301,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, for(j = 6; j < log2_numbits; j++, i <<= 1) { mi64_mul_vector_lo_half(w, yinv,tmp, lenS); mi64_nega (tmp,tmp, lenS); - bw = mi64_add_scalar(tmp, 2ull,tmp, lenS); ASSERT(HERE, !bw, ""); + bw = mi64_add_scalar(tmp, 2ull,tmp, lenS); ASSERT(!bw, ""); mi64_mul_vector_lo_half(yinv,tmp, yinv, lenS); } // Save inverse in case next call uses same modulus: @@ -5310,7 +5310,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, // Check the computed inverse: mi64_mul_vector_lo_half(w, yinv, tmp, lenS); - ASSERT(HERE, mi64_cmp_eq_scalar(tmp, 1ull, lenS), "Bad Montmul inverse!"); + ASSERT(mi64_cmp_eq_scalar(tmp, 1ull, lenS), "Bad Montmul inverse!"); #if MI64_DIV_MONT if(dbg)printf("yinv = %s\n", &s0[convert_mi64_base10_char(s0, yinv, lenS, 0)]); #endif @@ -5337,7 +5337,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, mi64_set_eq(itmp,tmp,lenS); // itmp = tmp } #if MI64_DIV_MONT - if(dbg)printf("v-cy = %s, bw = %llu\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw); + if(dbg)printf("v-cy = %s, bw = %" PRIu64 "\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw); #endif // Now do the Montgomery mod: cy = umulh( w, mull(tmp, yinv) ); @@ -5346,7 +5346,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, if(dbg)printf("MULL = %s\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)]); #endif // bw = 0 or 1, but may propagate all the way into high word: - ASSERT(HERE, 0ull == mi64_add_scalar(tmp,bw, tmp, lenS), 
"tmp += bw has carryout!"); + ASSERT(0ull == mi64_add_scalar(tmp,bw, tmp, lenS), "tmp += bw has carryout!"); // Do double-wide product. Fast-divisibility test needs just high half (stored in hi); low half (lo) useful to extract true-mod mi64_mul_vector(tmp,lenS,w,lenS,lo, (uint32*)&j); // lo:hi = MUL_LOHI(q, tmp) @@ -5359,7 +5359,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, #if MI64_DIV_MONT if(dbg)printf("itmp = %s\n", &s0[convert_mi64_base10_char(s0, itmp, lenS, 0)]); #endif - ASSERT(HERE, 0, "Low-half product check mismatch!"); + ASSERT(0, "Low-half product check mismatch!"); } } @@ -5423,7 +5423,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, // current power p after each halving step here to account for that: p = (p >> 1) + 1; } - ASSERT(HERE, j <= 32, "Need 64-bit bitstring!"); + ASSERT(j <= 32, "Need 64-bit bitstring!"); /* Now do the needed powering. We always start with p = 2 and M-square that to get p = 3: */ @@ -5441,7 +5441,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, MONT_SQR_N(itmp,lo,w,yinv,tmp,lenS); // printf("B^5 mod q = %s\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)]); } else { - ASSERT(HERE, 0,"Bad starting value for power p!"); + ASSERT(0,"Bad starting value for power p!"); } for(i = j-1; i >= 0; i--) { if(BIT_TEST(n,i)) { @@ -5494,13 +5494,13 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, // Now do the Montgomery mod: cy = umulh( y, mull(tmp, yinv) ); mi64_mul_vector_lo_half(tmp,yinv,tmp, lenS); // tmp = tmp*yinv + bw; #if MI64_DIV_MONT - if(dbg)printf("tmp*yinv = %s, bw = %llu\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw); + if(dbg)printf("tmp*yinv = %s, bw = %" PRIu64 "\n", &s0[convert_mi64_base10_char(s0, tmp, lenS, 0)], bw); #endif // Do double-wide product. 
Fast-divisibility test needs just high half (stored in hi); low half (lo) useful to extract true-mod mi64_mul_vector(tmp,lenS,w,lenS,lo, (uint32*)&j); // lo:hi = MUL_LOHI(q, tmp); cy is in hi half // (cy + bw); Since bw = 0 or 1, check that bw=1 does not propagate is (sum >= bw) in 1-limb form. // Apr 2022: in more-general multiword case, check that hi[] + bw does not yield a carryout: - itmp64 = mi64_add_scalar(hi,bw, hi, lenS); ASSERT(HERE, itmp64 == 0ull, "mi64_div_mont(): Unexpected carryout from (hi[] + bw) in quotient loop!"); + itmp64 = mi64_add_scalar(hi,bw, hi, lenS); ASSERT(itmp64 == 0ull, "mi64_div_mont(): Unexpected carryout from (hi[] + bw) in quotient loop!"); #if MI64_DIV_MONT if(dbg)printf(" lo = %s\n", &s0[convert_mi64_base10_char(s0, lo, lenS, 0)]); if(dbg)printf(" hi = %s\n", &s0[convert_mi64_base10_char(s0, hi, lenS, 0)]); @@ -5509,7 +5509,7 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, #if MI64_DIV_MONT printf("itmp = %s\n", &s0[convert_mi64_base10_char(s0, itmp, lenS, 0)]); #endif - ASSERT(HERE, 0, "Low-half product check mismatch!"); + ASSERT(0, "Low-half product check mismatch!"); } mi64_set_eq(q+i,tmp,lenS); // Equivalent to the y[i] = tmp step of the scalar routine } @@ -5518,11 +5518,11 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, if(j) { // Check cy = {v[i],v[i+1],...,v[lenX-1],0,...,0} if(!mi64_cmp_eq(hi,v+i,j)) { - ASSERT(HERE, mi64_cmp_eq(hi,v+i,j), "cy check!"); + ASSERT(mi64_cmp_eq(hi,v+i,j), "cy check!"); } mi64_clear(q+i,j); // Do after above check since v may == q for(i = j; i < lenS; i++) { - ASSERT(HERE, hi[i] == 0ull, "cy check!"); + ASSERT(hi[i] == 0ull, "cy check!"); } } #if MI64_DIV_MONT @@ -5558,12 +5558,12 @@ int mi64_div_mont(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY, if(!mi64_cmp_eq(rref,r,lenY)) { printf("rref = %s\n", &s0[convert_mi64_base10_char(s0, rref, lenD, 0)]); printf("rewm = %s\n", &s0[convert_mi64_base10_char(s0, r , 
lenD, 0)]); - ASSERT(HERE, 0, "bzzt!\n"); + ASSERT(0, "bzzt!\n"); } if(!mi64_cmp_eq(qref,q,lenX)) { printf("qref = %s\n", &s0[convert_mi64_base10_char(s0, qref, lenX, 0)]); printf("qewm = %s\n", &s0[convert_mi64_base10_char(s0, q , lenX, 0)]); - ASSERT(HERE, 0, "bzzt!\n"); + ASSERT(0, "bzzt!\n"); } free((void *)qref); qref = 0x0; @@ -5610,12 +5610,12 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY if(dbg) printf("mi64_div_binary: x = %s, y = %s\n",&s0[convert_mi64_base10_char(s0, x, lenX, 0)],&s1[convert_mi64_base10_char(s1, y, lenY, 0)]); #endif - ASSERT(HERE, lenX && lenY, "illegal 0 dimension!"); - ASSERT(HERE, x && y, "At least one of X, Y is null!"); - ASSERT(HERE, x != y, "X and Y arrays overlap!"); - ASSERT(HERE, r != y, "Y and Rem arrays overlap!"); - ASSERT(HERE, q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!"); - if(q) ASSERT(HERE, lenQ != 0x0, "If quotient requested, quotient-length pointer must be provided!"); + ASSERT(lenX && lenY, "illegal 0 dimension!"); + ASSERT(x && y, "At least one of X, Y is null!"); + ASSERT(x != y, "X and Y arrays overlap!"); + ASSERT(r != y, "Y and Rem arrays overlap!"); + ASSERT(q != x && q != y && (q == 0x0 || q != r), "Quotient array overlaps one of X, Y ,Rem!"); + if(q) ASSERT(lenQ != 0x0, "If quotient requested, quotient-length pointer must be provided!"); /* Init Q = 0; don't do similarly for R since we allow X and R to point to same array: Jan 2018: No! 
User may feed qvec only suficient in size to hold ACTUAL QUOTIENT, based on an estimate of the latter - I hit "EXC_BAD_ACCESS, Could not access memory" in a case with xlen = ylen = 2^20, qlen = 1, where I simply fed a @@ -5629,7 +5629,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY /* And now find the actual lengths of the divide operands and use those for the computation: */ xlen = mi64_getlen(x, lenX); ylen = mi64_getlen(y, lenY); - ASSERT(HERE, ylen != 0, "divide by 0!"); + ASSERT(ylen != 0, "divide by 0!"); // Allocate the needed auxiliary storage - the 2 yloc = ... / mi64_set_eq calls below copy (lenX + lenY) limbs into scratch, so alloc at least that much: if(lens < (lenX + lenY)) { @@ -5646,9 +5646,9 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY Setting said breakpoint is useless, can't see function context when hit. Instead try setting min-size = 1024 in lens = ... . ***/ #if 1 - scratch = (uint64 *)realloc(scratch, lens*sizeof(uint64)); ASSERT(HERE, scratch != 0x0, "alloc fail!"); + scratch = (uint64 *)realloc(scratch, lens*sizeof(uint64)); ASSERT(scratch != 0x0, "alloc fail!"); #else - tmp_ptr = (uint64 *)malloc(lens*sizeof(uint64)); ASSERT(HERE, tmp_ptr != 0x0, "alloc fail!"); + tmp_ptr = (uint64 *)malloc(lens*sizeof(uint64)); ASSERT(tmp_ptr != 0x0, "alloc fail!"); free(scratch); scratch = tmp_ptr; #endif } @@ -5674,7 +5674,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY lz_x = mi64_leadz(xloc, max_len); lz_y = mi64_leadz(yloc, max_len); nshift = lz_y - lz_x; - ASSERT(HERE, nshift >= 0, "nshift < 0"); + ASSERT(nshift >= 0, "nshift < 0"); i = (nshift+63)>>6; if(q) { mi64_clear(q, i); *lenQ = i; @@ -5686,9 +5686,9 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY if(dbg)printf("I = %3d: r = %s, yshift = %s\n", i,&s0[convert_mi64_base10_char(s0, xloc, max_len, 0)],&s1[convert_mi64_base10_char(s1, yloc, max_len, 
0)]); #endif if(mi64_cmpuge(xloc, yloc, max_len)) { - ASSERT(HERE, xlen == max_len,"xlen != max_len"); + ASSERT(xlen == max_len,"xlen != max_len"); mi64_sub(xloc, yloc, xloc, max_len); /* r -= yshift */ - ASSERT(HERE, mi64_cmpult(xloc, yloc, max_len),"r >= yshift"); + ASSERT(mi64_cmpult(xloc, yloc, max_len),"r >= yshift"); xlen = mi64_getlen(xloc, max_len); if(q) { mi64_set_bit(q,i,*lenQ,1); @@ -5702,7 +5702,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY } // Remainder in xloc - do some sanity checks prior to copying into r[]: xlen = mi64_getlen(xloc, lenX); - ASSERT(HERE, xlen <= ylen && mi64_cmpugt(y,xloc,ylen), "Remainder should be < modulus!"); + ASSERT(xlen <= ylen && mi64_cmpugt(y,xloc,ylen), "Remainder should be < modulus!"); if(r != 0x0) { mi64_set_eq(r, xloc, ylen); if(x == r) // If x == r, zero the leading (lenX-lenR) limbs of r prior to return: @@ -5711,7 +5711,7 @@ int mi64_div_binary(const uint64 x[], const uint64 y[], uint32 lenX, uint32 lenY mi64_clear(r+ylen,lenY-ylen); } /* Final value of yloc is unchanged from its (unshifted) starting value == y */ - ASSERT(HERE, mi64_cmp_eq(yloc,y,ylen), "Final value of y-copy differs from original!"); + ASSERT(mi64_cmp_eq(yloc,y,ylen), "Final value of y-copy differs from original!"); #if MI64_DIV_DBG if(dbg) { if(q)printf("mi64_div_binary: quotient = %s\n",&s0[convert_mi64_base10_char(s0, q, lenX, 0)]); @@ -5739,7 +5739,7 @@ int mi64_is_div_by_scalar32(const uint32 x[], uint32 q, uint32 len) { uint32 i,j,nshift,dlen,qinv,tmp,cy; - ASSERT(HERE, q > 0, "mi64_is_div_by_scalar32: 0 modulus!"); + ASSERT(q > 0, "mi64_is_div_by_scalar32: 0 modulus!"); if(q == 1) return TRUE; if(len == 0) return TRUE; @@ -5775,7 +5775,7 @@ int mi64_is_div_by_scalar32p(const uint32 x[], uint32 q, uint32 qinv, uint32 le { uint32 i,dlen,tmp,cy; - ASSERT(HERE, qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!"); + ASSERT(qinv == qinv*((uint32)2 - q*qinv), 
"mi64_is_div_by_scalar32p: bad qinv!"); cy = (uint32)0; dlen = len+len; /* Since are processing a uint64 array cast to uint32[], double the #words parameter */ for(i = 0; i < dlen; ++i) { @@ -5808,7 +5808,7 @@ int mi64_is_div_by_scalar32p_x8( uint32 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7,cy0,cy1,cy2,cy3,cy4,cy5,cy6,cy7; cy0 = cy1 = cy2 = cy3 = cy4 = cy5 = cy6 = cy7 = (uint32)0; - ASSERT(HERE, qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!"); + ASSERT(qinv == qinv*((uint32)2 - q*qinv), "mi64_is_div_by_scalar32p: bad qinv!"); tmp0 = a[0] * qinv; tmp1 = b[0] * qinv; @@ -5894,7 +5894,7 @@ uint32 mi64_is_div_by_scalar32_x4(const uint32 x[], uint32 q0, uint32 q1, uint32 uint32 retval=0,dlen = len+len, qinv0,qinv1,qinv2,qinv3,tmp0,tmp1,tmp2,tmp3,cy0,cy1,cy2,cy3; uint32 xcur,trailx; - ASSERT(HERE, q0 && q1 && q2 && q3, "mi64_is_div_by_scalar32_x4: 0 modulus!"); + ASSERT(q0 && q1 && q2 && q3, "mi64_is_div_by_scalar32_x4: 0 modulus!"); if(q0 + q1 + q2 + q3 == 4) return TRUE; if(len == 0) return TRUE; @@ -5967,7 +5967,7 @@ uint32 mi64_is_div_by_scalar32_x8(const uint32 x[], uint32 q0, uint32 q1, uint32 uint32 retval=0,dlen = len+len, qinv0,qinv1,qinv2,qinv3,qinv4,qinv5,qinv6,qinv7,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7,cy0,cy1,cy2,cy3,cy4,cy5,cy6,cy7; uint32 xcur,trailx; - ASSERT(HERE, q0 && q1 && q2 && q3 && q4 && q5 && q6 && q7, "mi64_is_div_by_scalar32_x8: 0 modulus!"); + ASSERT(q0 && q1 && q2 && q3 && q4 && q5 && q6 && q7, "mi64_is_div_by_scalar32_x8: 0 modulus!"); if(q0 + q1 + q2 + q3 + q4 + q5 + q6 + q7 == 8) return TRUE; if(len == 0) return TRUE; @@ -6085,10 +6085,10 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) if(itmp64 > q) { // This check allows us to differentiate between incorrect upward-rounded and (rarer) downward-rounded cases: if(DNINT(fquo) == (double)rem64) { // Incorrect upward-rounded, e.g. fquo = 1084809392143.0001, exact = 1084809392142.999... 
- // printf("%sA: q = %llu < itmp64 = (int64)%lld, fquo = %20.4f, (double)rem64 = %20.4f\n",func,q,(int64)itmp64, fquo, (double)rem64); + // printf("%sA: q = %" PRIu64 " < itmp64 = (int64)%" PRId64 ", fquo = %20.4f, (double)rem64 = %20.4f\n",func,q,(int64)itmp64, fquo, (double)rem64); itmp64 += q; } else { // Incorrect downward-rounded, e.g. fquo = 7344640876302.9990, exact = 7344640876303.0000002... - // printf("%sB: q = %llu < itmp64 = (int64)%lld, fquo = %20.4f *** Bad Downward ***\n",func,q,(int64)itmp64, fquo); + // printf("%sB: q = %" PRIu64 " < itmp64 = (int64)%" PRId64 ", fquo = %20.4f *** Bad Downward ***\n",func,q,(int64)itmp64, fquo); itmp64 -= q; } } @@ -6134,7 +6134,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) // Floating-point computation of 2^96 % q not 100% reliable - this pure-int code is our safety net: if(itmp64 > q) { - printf("Error correction failed: itmp64 = (int64)%lld, q = %llu [lq(q) = %6.4f]\n",(int64)itmp64,q,log(q)/log(2)); + printf("Error correction failed: itmp64 = (int64)%" PRId64 ", q = %" PRIu64 " [lq(q) = %6.4f]\n",(int64)itmp64,q,log(q)/log(2)); // In such cases re-do using the slower but bulletproof pure-integer method. 
// Use mod-doublings to get 2^68 (mod q), followed by 3 MONT_SQR64: itmp64 = 0x8000000000000000ull % q; // 2^63 (mod q) @@ -6147,7 +6147,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) MONT_SQR64(itmp64,q,qinv,itmp64); // 2^(2*68-64) == 2^72 (mod q) MONT_SQR64(itmp64,q,qinv,itmp64); // 2^(2*72-64) == 2^80 (mod q) MONT_SQR64(itmp64,q,qinv,itmp64); // 2^(2*80-64) == 2^96 (mod q) - ASSERT(HERE, itmp64 < q, "Pure-integer computation of 2^96 mod q fails!"); + ASSERT(itmp64 < q, "Pure-integer computation of 2^96 mod q fails!"); } } else if(q >> 32) { // q in [2^32,2^48) @@ -6156,7 +6156,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) itmp64 = 0x1000000000000000ull % q; // 2^60 (mod q) MONT_MUL48(itmp64,itmp64,q,qinv,itmp64); // 2^(2*60-48) == 2^72 (mod q) MONT_MUL48(itmp64,itmp64,q,qinv,itmp64); // 2^(2*72-48) == 2^96 (mod q) - ASSERT(HERE, itmp64 < q, "Pure-integer computation of 2^96 mod q fails!"); + ASSERT(itmp64 < q, "Pure-integer computation of 2^96 mod q fails!"); } else { // q < 2^32 @@ -6169,7 +6169,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) itmp32 -= (-(q32 < itmp32) & q32); // If 2*itmp32 > q, subtract q // itmp32 = 2^64 (mod q) MONT_MUL32(itmp32,itmp32,q32,qinv32,itmp32); // 2^(2*64-32) == 2^96 (mod q) - ASSERT(HERE, itmp32 < q32, "Pure-integer computation of 2^96 mod q fails!"); + ASSERT(itmp32 < q32, "Pure-integer computation of 2^96 mod q fails!"); itmp64 = itmp32; // promote to 64-bit } @@ -6177,7 +6177,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) MONT_SQR64(itmp64,q,qinv,rem64); #if MI64_RAD_POW64_DBG - if(dbg)printf("B^2 mod q = %20llu\n",rem64); + if(dbg)printf("B^2 mod q = %20" PRIu64 "\n",rem64); #endif /* rem64 holds B^2 mod q - Now compute sequence of powers needed to obtain B^len mod q via Montgomery-muls: */ @@ -6200,7 +6200,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) } else if(p == 5) { MONT_SQR64(itmp64,q,qinv,rem64); } 
else { - ASSERT(HERE, 0,"Bad starting value for power p!"); + ASSERT(0,"Bad starting value for power p!"); } for(i = j-1; i >= 0; i--) { if(BIT_TEST(bmap,i)) { @@ -6213,7 +6213,7 @@ uint64 radix_power64(const uint64 q, const uint64 qinv, uint32 n) } } #if MI64_RAD_POW64_DBG - if(dbg && p > 2)printf("B^%u mod q = %20llu\n",n,rem64); + if(dbg && p > 2)printf("B^%u mod q = %20" PRIu64 "\n",n,rem64); #endif return rem64; } @@ -6229,7 +6229,7 @@ int mi64_is_div_by_scalar64(const uint64 x[], uint64 q, uint32 len) uint32 i,nshift; uint64 qinv,cy; - ASSERT(HERE, q > 0, "mi64_is_div_by_scalar64: 0 modulus!"); + ASSERT(q > 0, "mi64_is_div_by_scalar64: 0 modulus!"); if(q == 1) return TRUE; if(len == 0) return TRUE; @@ -6346,9 +6346,9 @@ int mi64_is_div_by_scalar64_x4(const uint64 x[], uint64 q0, uint64 q1, uint64 q2 uint32 nshift0,nshift1,nshift2,nshift3; uint64 qinv0,qinv1,qinv2,qinv3,cy0,cy1,cy2,cy3; - ASSERT(HERE, (len == 0), "0 length!"); + ASSERT((len == 0), "0 length!"); trailx = trailz64(x[0]); - ASSERT(HERE, trailx < 64, "0 low word!"); + ASSERT(trailx < 64, "0 low word!"); /* q must be odd for Montgomery-style modmul to work, so first shift off any low 0s: */ nshift0 = trailz64(q0); @@ -6360,8 +6360,8 @@ int mi64_is_div_by_scalar64_x4(const uint64 x[], uint64 q0, uint64 q1, uint64 q2 q1 >>= nshift1; q2 >>= nshift2; q3 >>= nshift3; - ASSERT(HERE, q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!"); - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); + ASSERT(q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); qinv0 = (q0+q0+q0) ^ (uint64)2; qinv1 = (q1+q1+q1) ^ (uint64)2; @@ -6444,7 +6444,7 @@ int mi64_is_div_by_scalar64_x4(const uint64 x[], uint64 q0, uint64 q1, uint64 q2 #endif #if MI64_ISDIV_X4_DBG - if(dbg)printf("4-way carryouts: cy0-3 = %20llu, %20llu, %20llu, %20llu\n",cy0,cy1,cy2,cy3); + if(dbg)printf("4-way carryouts: cy0-3 = %20" PRIu64 ", %20" PRIu64 
", %20" PRIu64 ", %20" PRIu64 "\n",cy0,cy1,cy2,cy3); #endif retval += ((cy0 == 0) && (nshift0 <= trailx)); retval += ((cy1 == 0) && (nshift1 <= trailx)) << 1; @@ -6469,13 +6469,13 @@ int mi64_is_div_by_scalar64_u2(const uint64 x[], uint64 q, uint32 len) uint32 i,len2 = (len>>1),nshift; uint64 qinv,cy0,cy1,rpow; - ASSERT(HERE, q > 0, "mi64_is_div_by_scalar64: 0 modulus!"); + ASSERT(q > 0, "mi64_is_div_by_scalar64: 0 modulus!"); if(q == 1) return TRUE; if(len == 0) return TRUE; - ASSERT(HERE, (len&1) == 0, "odd length!"); + ASSERT((len&1) == 0, "odd length!"); /* q must be odd for Montgomery-style modmul to work, so first shift off any low 0s: */ nshift = trailz64(q); -ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!"); +ASSERT(!nshift, "2-way folded ISDIV requires odd q!"); if(nshift) { if(trailz64(x[0]) < nshift) return FALSE; q >>= nshift; @@ -6539,7 +6539,7 @@ ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!"); #endif #if MI64_ISDIV_U2_DBG - if(dbg)printf("Half-length carryouts: cy0 = %20llu, cy1 = %20llu\n",cy0,cy1); + if(dbg)printf("Half-length carryouts: cy0 = %20" PRIu64 ", cy1 = %20" PRIu64 "\n",cy0,cy1); #endif // Compute radix-power; add 1 since used high-MUL version of the scaled-remainder algo ( = Algorithm A in the paper) rpow = radix_power64(q,qinv,len2+1); @@ -6548,8 +6548,8 @@ ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!"); MONT_MUL64(cy1,rpow,q,qinv,cy1); // cy1*B^p (mod q) #if MI64_ISDIV_U2_DBG if(dbg) { - printf("s1 mod q) = %20llu\n",cy0); - printf("s2*B^p mod q) = %20llu\n",cy1); + printf("s1 mod q) = %20" PRIu64 "\n",cy0); + printf("s2*B^p mod q) = %20" PRIu64 "\n",cy1); } #endif // Sum the scaled partial remainders: @@ -6558,13 +6558,13 @@ ASSERT(HERE, !nshift, "2-way folded ISDIV requires odd q!"); // Negation (mod q) needed for Algo A scaled remainder if(cy0) cy0 = q-cy0 ; #if MI64_ISDIV_U2_DBG - if(dbg)printf("(s1 + s2*B^p) mod q = %20llu, q = %20llu\n",cy0,q); + if(dbg)printf("(s1 + s2*B^p) mod q = %20" 
PRIu64 ", q = %20" PRIu64 "\n",cy0,q); #endif // One more modmul of sum by same power of the base gives true remainder - may as well, since we already have B^p handy: MONT_MUL64(cy0,rpow,q,qinv,cy0); #if MI64_ISDIV_U2_DBG if(dbg) { - printf("True mod x mod q = %20llu\n",cy0); + printf("True mod x mod q = %20" PRIu64 "\n",cy0); exit(0); } #endif @@ -6588,13 +6588,13 @@ int mi64_is_div_by_scalar64_u4(const uint64 x[], uint64 q, uint32 len) uint32 i,len4 = (len>>2),nshift; uint64 qinv,cy0,cy1,cy2,cy3,rpow; - ASSERT(HERE, q > 0, "mi64_is_div_by_scalar64: 0 modulus!"); + ASSERT(q > 0, "mi64_is_div_by_scalar64: 0 modulus!"); if(q == 1) return TRUE; if(len == 0) return TRUE; - ASSERT(HERE, (len&3) == 0, "Length must be a multiple of 4!"); + ASSERT((len&3) == 0, "Length must be a multiple of 4!"); /* q must be odd for Montgomery-style modmul to work, so first shift off any low 0s: */ nshift = trailz64(q); -ASSERT(HERE, !nshift, "4-way folded ISDIV requires odd q!"); +ASSERT(!nshift, "4-way folded ISDIV requires odd q!"); if(nshift) { if(trailz64(x[0]) < nshift) return FALSE; q >>= nshift; @@ -6724,7 +6724,7 @@ ASSERT(HERE, !nshift, "4-way folded ISDIV requires odd q!"); #endif #if MI64_ISDIV_U4_DBG - if(dbg)printf("Half-length carryouts: cy0-3 = %20llu, %20llu, %20llu, %20llu\n",cy0,cy1,cy2,cy3); + if(dbg)printf("Half-length carryouts: cy0-3 = %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 "\n",cy0,cy1,cy2,cy3); #endif // Compute radix-power; add 1 since used high-MUL version of the scaled-remainder algo ( = Algorithm A in the paper) rpow = radix_power64(q,qinv,len4+1); @@ -6736,13 +6736,13 @@ ASSERT(HERE, !nshift, "4-way folded ISDIV requires odd q!"); // Negation (mod q) needed for Algo A scaled remainder if(cy0) cy0 = q-cy0 ; #if MI64_ISDIV_U4_DBG - if(dbg) printf("(sum0-3) mod q = %20llu, q = %20llu\n",cy0,q); + if(dbg) printf("(sum0-3) mod q = %20" PRIu64 ", q = %20" PRIu64 "\n",cy0,q); #endif // One more modmul of sum by same power of the base gives true 
remainder: MONT_MUL64(cy0,rpow,q,qinv,cy0); #if MI64_ISDIV_U4_DBG if(dbg) { - printf("True mod x mod q = %20llu\n",cy0); + printf("True mod x mod q = %20" PRIu64 "\n",cy0); exit(0); } #endif @@ -6768,13 +6768,13 @@ uint64 mi64_div_by_scalar64(const uint64 x[], uint64 q, uint32 len, uint64 y[]) uint64 qinv,tmp = 0,bw,cy,lo,rem64,rem_save = 0,itmp64,mask,*iptr; double fquo,fqinv; /* Debug: -printf("x[]/q, quotient q = %llu, base b = 2^64\n",q); +printf("x[]/q, quotient q = %" PRIu64 ", base b = 2^64\n",q); for(i = 0; i < len; i++) - printf("x[%u] = %20llu;\n",i,x[i]); + printf("x[%u] = %20" PRIu64 ";\n",i,x[i]); printf("\n"); */ - ASSERT(HERE, (x != 0) && (len != 0), "Null input array or length parameter!"); - ASSERT(HERE, q > 0, "0 modulus!"); + ASSERT((x != 0) && (len != 0), "Null input array or length parameter!"); + ASSERT(q > 0, "0 modulus!"); // Unit modulus needs special handling to return proper 0 remainder rather than 1: if(q == 1ull) { if(y) mi64_set_eq(y,x,len); @@ -6802,7 +6802,7 @@ printf("\n"); rem_save = x[0] & mask; // (Which we don`t do since x is read-only; thus we are forced into accounting tricks :) q >>= nshift; } - ASSERT(HERE, (q & (uint64)1) == 1, "q must be odd!"); + ASSERT((q & (uint64)1) == 1, "q must be odd!"); uint32 q32,qi32; q32 = q; qi32 = minv8[(q&0xff)>>1]; @@ -6814,12 +6814,12 @@ printf("\n"); if(dbg) { printf("%s: nshift = %u, Input vector: x = 0;\n",func,nshift,q); if(len > 100) { - printf("x[%u] = %20llu, ... x[0] = %20llu\n",len-1,x[len-1],x[0]); // Pari-debug inputs; For every i++, shift count += 64 + printf("x[%u] = %20" PRIu64 ", ... 
x[0] = %20" PRIu64 "\n",len-1,x[len-1],x[0]); // Pari-debug inputs; For every i++, shift count += 64 } else { - for(i = 0; i < len; i++) printf("i = %u; x+=%20llu<<(i<<6);\n",i,x[i]); // Pari-debug inputs; For every i++, shift count += 64 + for(i = 0; i < len; i++) printf("i = %u; x+=%20" PRIu64 "<<(i<<6);\n",i,x[i]); // Pari-debug inputs; For every i++, shift count += 64 printf("\n"); } - printf("q = %20llu; qinv = %20llu\n",q,qinv); + printf("q = %20" PRIu64 "; qinv = %20" PRIu64 "\n",q,qinv); } #endif @@ -6833,7 +6833,7 @@ printf("\n"); #if MI64_DIV_MONT64 bw = cy; // Save a copy of the borrow flag for debug-printing itmp64 = tmp + ((-cy)&q); // Expected value of low-half of MUL_LOHI - // if(dbg)printf("i = %4u, tmp*qinv = %20llu\n",i,tmp*qinv); + // if(dbg)printf("i = %4u, tmp*qinv = %20" PRIu64 "\n",i,tmp*qinv); #endif tmp = tmp*qinv + cy; // Do double-wide product. Fast-divisibility test needs just high half (stored in cy); low half (tmp) needed to extract true-mod @@ -6843,8 +6843,8 @@ printf("\n"); MUL_LOHI64(q, tmp, tmp, cy); #endif #if MI64_DIV_MONT64 - // if(dbg)printf("i = %4u, lo = %20llu, hi = %20llu, bw = %1u\n",i,tmp,cy,(uint32)bw); - ASSERT(HERE, itmp64 == tmp, "Low-half product check mismatch!"); + // if(dbg)printf("i = %4u, lo = %20" PRIu64 ", hi = %20" PRIu64 ", bw = %1u\n",i,tmp,cy,(uint32)bw); + ASSERT(itmp64 == tmp, "Low-half product check mismatch!"); #endif } } else { // Even modulus, with or without quotient computation, uses Algo B @@ -6871,8 +6871,8 @@ printf("\n"); MUL_LOHI64(q, tmp, tmp, cy); #endif #if MI64_DIV_MONT64 - // if(dbg)printf("i = %4u, lo = %20llu, hi = %20llu, bw = %1u\n",i,tmp,cy,(uint32)bw); - ASSERT(HERE, *iptr == tmp, "Low-half product check mismatch!"); + // if(dbg)printf("i = %4u, lo = %20" PRIu64 ", hi = %20" PRIu64 ", bw = %1u\n",i,tmp,cy,(uint32)bw); + ASSERT(*iptr == tmp, "Low-half product check mismatch!"); #endif } // Last element has no shift-in from next-higher term, so can compute just the low-half output 
term, sans explicit MULs: @@ -6881,7 +6881,7 @@ printf("\n"); cy = (cy > *iptr); tmp = tmp + ((-cy)&q); #if MI64_DIV_MONT64 - // if(dbg)printf("i = %4u, lo_out = %20llu\n",i,tmp); + // if(dbg)printf("i = %4u, lo_out = %20" PRIu64 "\n",i,tmp); #endif } @@ -6910,7 +6910,7 @@ printf("\n"); rem64 = rem64 - q*(uint64)fquo; } if(rem64 != tmp%q) { - fprintf(stderr,"WARNING: Bad floating-point mod in mi64_div_by_scalar64! x = %llu, q = %llu: exact remainder = %llu, FP gives %llu.\n",x[0],q,tmp%q,rem64); + fprintf(stderr,"WARNING: Bad floating-point mod in mi64_div_by_scalar64! x = %" PRIu64 ", q = %" PRIu64 ": exact remainder = %" PRIu64 ", FP gives %" PRIu64 ".\n",x[0],q,tmp%q,rem64); rem64 = tmp%q; // Replace FP-approximation result with exact } if(y) { @@ -6939,7 +6939,7 @@ printf("\n"); // current (partial) remainder and re-add the off-shifted part of the true remainder. rem64 = (rem64 << nshift) + rem_save; #if MI64_DIV_MONT64 - if(dbg)printf("True mod: x mod q = %20llu\n",rem64); + if(dbg)printf("True mod: x mod q = %20" PRIu64 "\n",rem64); #endif if(!y) // Only remainder needed @@ -6956,7 +6956,7 @@ printf("\n"); bw = 0; cy = rem64; for(i = 0; i < len; ++i) { #if MI64_DIV_MONT64 - // if(dbg && i%(len>>2) == 0)printf("bw = %1llu, cy%1u = %20llu\n",bw,i/(len>>2),cy); // Use to debug loop-folded implemntation + // if(dbg && i%(len>>2) == 0)printf("bw = %1" PRIu64 ", cy%1u = %20" PRIu64 "\n",bw,i/(len>>2),cy); // Use to debug loop-folded implemntation #endif tmp = x[i] - bw - cy; /* Since may be working in-place, need an extra temp here due to asymmetry of subtract: */ @@ -6971,8 +6971,8 @@ printf("\n"); MUL_LOHI64(q, tmp, lo, cy); #endif #if MI64_DIV_MONT64 - // if(dbg)printf("i = %4u, quot[i] = %20llu, lo1 = %20llu, lo2 = %20llu, hi = %20llu, bw = %1u\n",i,tmp,itmp64,lo,cy,(uint32)bw); - ASSERT(HERE, itmp64 == lo, "Low-half product check mismatch!"); + // if(dbg)printf("i = %4u, quot[i] = %20" PRIu64 ", lo1 = %20" PRIu64 ", lo2 = %20" PRIu64 ", hi = %20" PRIu64 ", bw 
= %1u\n",i,tmp,itmp64,lo,cy,(uint32)bw); + ASSERT(itmp64 == lo, "Low-half product check mismatch!"); #endif y[i] = tmp; } @@ -6990,20 +6990,20 @@ printf("\n"); MUL_LOHI64(q, tmp, lo, cy); #endif #if MI64_DIV_MONT64 - // if(dbg)printf("i = %4u, quot[i] = %20llu\n",i,tmp); + // if(dbg)printf("i = %4u, quot[i] = %20" PRIu64 "\n",i,tmp); #endif y[i] = tmp; } } - ASSERT(HERE, bw == 0 && cy == 0, "bw/cy check!"); + ASSERT(bw == 0 && cy == 0, "bw/cy check!"); #if MI64_DIV_MONT64 if(dbg) { - printf("len = %u, q = %llu, nshift = %u, rem = %llu\n",len,q,nshift,rem64); + printf("len = %u, q = %" PRIu64 ", nshift = %u, rem = %" PRIu64 "\n",len,q,nshift,rem64); if(len > 100) { - printf("Quotient y[%u] = %20llu, y[%u] = %20llu, ... y[0] = %20llu\n",len-1,y[len-1],len-2,y[len-2],y[0]); // Pari-debug inputs; For every i++, shift count += 64 + printf("Quotient y[%u] = %20" PRIu64 ", y[%u] = %20" PRIu64 ", ... y[0] = %20" PRIu64 "\n",len-1,y[len-1],len-2,y[len-2],y[0]); // Pari-debug inputs; For every i++, shift count += 64 } else { printf("Quotient y = 0;\n"); - for(i = 0; i < len; i++) printf("i = %u; y+=%20llu<<(i<<6);\n",i,y[i]); // Pari-debug inputs; For every i++, shift count += 64 + for(i = 0; i < len; i++) printf("i = %u; y+=%20" PRIu64 "<<(i<<6);\n",i,y[i]); // Pari-debug inputs; For every i++, shift count += 64 printf("\n"); } } @@ -7026,8 +7026,8 @@ uint64 mi64_div_by_scalar64_u2(uint64 x[], uint64 q, uint32 lenu, uint64 y[]) // #endif int i,j,npad = (lenu&1),len = lenu + npad,len2 = (len>>1),nshift,lshift = -1; // Pad to even length uint64 qinv,cy0,cy1,rpow,rem_save = 0,xsave,itmp64,mask,*iptr0,*iptr1,ptr_incr; - ASSERT(HERE, (x != 0) && (len != 0), "Null input array or length parameter!"); - ASSERT(HERE, q > 0, "0 modulus!"); + ASSERT((x != 0) && (len != 0), "Null input array or length parameter!"); + ASSERT(q > 0, "0 modulus!"); // Unit modulus needs special handling to return proper 0 remainder rather than 1: if(q == 1ull) { if(y) mi64_set_eq(y,x,len); @@ -7204,7 
+7204,7 @@ See similar behavior for 4-way-split version of the algorithm. #endif #if MI64_DIV_MONT64_U2 - if(dbg)printf("Half-length carryouts: cy0 = %20llu, cy1 = %20llu\n",cy0,cy1); + if(dbg)printf("Half-length carryouts: cy0 = %20" PRIu64 ", cy1 = %20" PRIu64 "\n",cy0,cy1); #endif if(!nshift) { // Odd modulus uses Algo A @@ -7224,7 +7224,7 @@ See similar behavior for 4-way-split version of the algorithm. MONT_MUL64(cy0,rpow,q,qinv,cy0); #if MI64_DIV_MONT64_U2 - if(dbg) printf("True mod %c = %20llu\n",'A'+(nshift != 0),cy0); + if(dbg) printf("True mod %c = %20" PRIu64 "\n",'A'+(nshift != 0),cy0); #endif // If we applied an initial right-justify shift to the modulus, restore the shift to the @@ -7255,7 +7255,7 @@ See similar behavior for 4-way-split version of the algorithm. MULH64(q,tmp0, cy0); MULH64(q,tmp1, cy1); #endif #if MI64_DIV_MONT64_U2 - if(dbg)printf("quot[%2u] = %20llu, quot[%2u] = %20llu, bw0,1 = %1u,%1u, cy0,1 = %20llu,%20llu\n",i,tmp0,i+len2,tmp1,(uint32)bw0,(uint32)bw1,cy0,cy1); + if(dbg)printf("quot[%2u] = %20" PRIu64 ", quot[%2u] = %20" PRIu64 ", bw0,1 = %1u,%1u, cy0,1 = %20" PRIu64 ",%20" PRIu64 "\n",i,tmp0,i+len2,tmp1,(uint32)bw0,(uint32)bw1,cy0,cy1); #endif // Write quotient word(s): y[i] = tmp0; y[i+len2] = tmp1; @@ -7339,7 +7339,7 @@ See similar behavior for 4-way-split version of the algorithm. 
#endif - ASSERT(HERE, cy1 == 0, "cy check!"); // all but the uppermost carryout are generally nonzero + ASSERT(cy1 == 0, "cy check!"); // all but the uppermost carryout are generally nonzero x[lenu] = xsave; // Restore input value of zero-padding one-beyond element x[lenu] prior to return return rpow; } @@ -7375,15 +7375,15 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[]) static uint64 *svec = 0x0; // svec = "scratch vector" if(first_entry) { first_entry = FALSE; - svec = (uint64 *)calloc(len_save, sizeof(uint64)); ASSERT(HERE, svec != 0x0, "alloc failed!"); + svec = (uint64 *)calloc(len_save, sizeof(uint64)); ASSERT(svec != 0x0, "alloc failed!"); } if(len > len_save) { len_save = len<<1; - svec = (uint64 *)realloc(svec, len_save*sizeof(uint64)); ASSERT(HERE, svec != 0x0, "alloc failed!"); + svec = (uint64 *)realloc(svec, len_save*sizeof(uint64)); ASSERT(svec != 0x0, "alloc failed!"); } - ASSERT(HERE, (x != 0) && (len != 0), "Null input array or length parameter!"); - ASSERT(HERE, q > 0, "0 modulus!"); + ASSERT((x != 0) && (len != 0), "Null input array or length parameter!"); + ASSERT(q > 0, "0 modulus!"); // Unit modulus needs special handling to return proper 0 remainder rather than 1: if(q == 1ull) { if(y) mi64_set_eq(y,x,len); @@ -7798,7 +7798,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[]) #endif #if MI64_DIV_MONT64_U4 - if(dbg)printf("Half-length carryouts: cy0-3 = %20llu, %20llu, %20llu, %20llu\n",cy0,cy1,cy2,cy3); + if(dbg)printf("Half-length carryouts: cy0-3 = %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 ", %20" PRIu64 "\n",cy0,cy1,cy2,cy3); #endif #ifdef USE_AVX2 @@ -7830,7 +7830,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[]) MONT_MUL64(cy0,rpow,q,qinv,cy0); #if MI64_DIV_MONT64_U4 - if(dbg) printf("True mod %c = %20llu\n",'A'+(nshift != 0),cy0); + if(dbg) printf("True mod %c = %20" PRIu64 "\n",'A'+(nshift != 0),cy0); #endif // If we applied an initial 
right-justify shift to the modulus, restore the shift to the @@ -7862,7 +7862,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[]) MULH64(q,tmp0, cy0); MULH64(q,tmp1, cy1); MULH64(q,tmp2, cy2); MULH64(q,tmp3, cy3); #endif #if MI64_DIV_MONT64_U4 - if(dbg)printf("quot[%2u,%2u,%2u,%2u] = %20llu,%20llu,%20llu,%20llu, bw0-3 = %1u,%1u,%1u,%1u, cy0-3 = %20llu,%20llu,%20llu,%20llu\n",i0,i1,i2,i3,tmp0,tmp1,tmp2,tmp3,(uint32)bw0,(uint32)bw1,(uint32)bw2,(uint32)bw3,cy0,cy1,cy2,cy3); + if(dbg)printf("quot[%2u,%2u,%2u,%2u] = %20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 ", bw0-3 = %1u,%1u,%1u,%1u, cy0-3 = %20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 ",%20" PRIu64 "\n",i0,i1,i2,i3,tmp0,tmp1,tmp2,tmp3,(uint32)bw0,(uint32)bw1,(uint32)bw2,(uint32)bw3,cy0,cy1,cy2,cy3); #endif // Write quotient words: y[i] = tmp0; y[i+len4] = tmp1; y[i+len2] = tmp2; y[i+len2+len4] = tmp3; @@ -7992,7 +7992,7 @@ uint64 mi64_div_by_scalar64_u4(uint64 x[], uint64 q, uint32 lenu, uint64 y[]) #endif // AVX2/MULX or not? 
#endif - ASSERT(HERE, cy3 == 0, "cy check!"); // all but the uppermost carryout are generally nonzero + ASSERT(cy3 == 0, "cy check!"); // all but the uppermost carryout are generally nonzero // Restore input values of 0-pad elements prior to return: for(i = 0; i < npad; i++) { x[lenu+i] = pads[i]; @@ -8053,7 +8053,7 @@ uint32 mi64_div_y32(uint64 x[], uint32 y, uint64 q[], uint32 len) rem = tsum%y; } if(rem == 0 && x != q) { // If overwrote input with quotient in above loop, skip this - ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)x, y, len), "Results of mi64_div_y32 and mi64_is_div_by_scalar32 differ!"); + ASSERT(mi64_is_div_by_scalar32((uint32 *)x, y, len), "Results of mi64_div_y32 and mi64_is_div_by_scalar32 differ!"); return 0; } return (uint32)rem; @@ -8083,7 +8083,7 @@ int __convert_mi64_base10_char(char char_buf[], uint32 n_alloc_chars, const uint double dtmp = 0.0; static uint64 *temp = 0x0; static uint32 tlen = 0; // #64-bit slots in current memalloc for *temp - ASSERT(HERE, fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!"); // Make sure these scaling powers have been inited + ASSERT(fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!"); // Make sure these scaling powers have been inited /* Estimate # of decimal digits: */ curr_len = mi64_getlen(x, len); /* this checks that len > 0; need at least one digit, even if it = 0. curr_len guaranteed > 0. 
*/ @@ -8092,7 +8092,7 @@ int __convert_mi64_base10_char(char char_buf[], uint32 n_alloc_chars, const uint if(temp) { free((void *)temp); temp = 0x0; } - temp = (uint64 *)calloc(curr_len, sizeof(uint64)); ASSERT(HERE, temp != 0x0, "alloc failed!"); + temp = (uint64 *)calloc(curr_len, sizeof(uint64)); ASSERT(temp != 0x0, "alloc failed!"); tlen = curr_len; } mi64_set_eq(temp, x, curr_len); @@ -8100,7 +8100,7 @@ int __convert_mi64_base10_char(char char_buf[], uint32 n_alloc_chars, const uint if(curr_len > 1) dtmp = x[curr_len-2]*TWO64FLINV; MAX_DIGITS = ceil( (curr_len-1)*log10_base + log((double)x[curr_len-1] + dtmp)/ln10 ); MAX_DIGITS = MAX(MAX_DIGITS, 1); - ASSERT(HERE, MAX_DIGITS < n_alloc_chars, "Output string overflows buffer"); + ASSERT(MAX_DIGITS < n_alloc_chars, "Output string overflows buffer"); if(wrap_every) { MAX_DIGITS += MAX_DIGITS/wrap_every; } @@ -8151,7 +8151,7 @@ int __convert_mi64_base10_char_print_lead0(char char_buf[], uint32 n_alloc_chars double dtmp = 0.0; static uint64 *temp = 0x0; static uint32 tlen = 0; // #64-bit slots in current memalloc for *temp - ASSERT(HERE, fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!"); // Make sure these scaling powers have been inited + ASSERT(fabs(1.0 - TWO64FLOAT*TWO64FLINV) < 1e-14, "ERROR: TWO64FLOAT not inited!"); // Make sure these scaling powers have been inited /* Estimate # of decimal digits: */ curr_len = mi64_getlen(x, len); /* this checks that len > 0; need at least one digit, even if it = 0. curr_len guaranteed > 0. 
*/ @@ -8160,7 +8160,7 @@ int __convert_mi64_base10_char_print_lead0(char char_buf[], uint32 n_alloc_chars if(temp) { free((void *)temp); temp = 0x0; } - temp = (uint64 *)calloc(curr_len, sizeof(uint64)); ASSERT(HERE, temp != 0x0, "alloc failed!"); + temp = (uint64 *)calloc(curr_len, sizeof(uint64)); ASSERT(temp != 0x0, "alloc failed!"); tlen = curr_len; } mi64_set_eq(temp, x, curr_len); @@ -8168,11 +8168,11 @@ int __convert_mi64_base10_char_print_lead0(char char_buf[], uint32 n_alloc_chars if(curr_len > 1) dtmp = x[curr_len-2]*TWO64FLINV; MAX_DIGITS = ceil( (curr_len-1)*log10_base + log((double)x[curr_len-1] + dtmp)/ln10 ); if(MAX_DIGITS > ndigit) { - ASSERT(HERE, 0, "ERROR: MAX_DIGITS > ndigit!"); + ASSERT(0, "ERROR: MAX_DIGITS > ndigit!"); } else { MAX_DIGITS = ndigit; } - ASSERT(HERE, MAX_DIGITS < n_alloc_chars, "Output string overflows buffer"); + ASSERT(MAX_DIGITS < n_alloc_chars, "Output string overflows buffer"); if(wrap_every) { MAX_DIGITS += MAX_DIGITS/wrap_every; } @@ -8243,7 +8243,7 @@ uint64 *convert_base10_char_mi64(const char*char_buf, uint32 *len) LEN_MAX = (uint32)ceil( (imax-i)/log10_base ); } // 01/09/2009: Add an extra zero-pad element here as workaround for bug in mi64_div called with differing-length operands: - mi64_vec = (uint64 *)calloc(LEN_MAX+1, sizeof(uint64)); ASSERT(HERE, mi64_vec != 0x0, "alloc failed!"); + mi64_vec = (uint64 *)calloc(LEN_MAX+1, sizeof(uint64)); ASSERT(mi64_vec != 0x0, "alloc failed!"); imin = i; for(i = imin; i < imax; i++) { c = char_buf[i]; @@ -8251,19 +8251,19 @@ uint64 *convert_base10_char_mi64(const char*char_buf, uint32 *len) free((void *)mi64_vec); *len = 0; return 0x0; } curr_digit = (uint64)(c - CHAROFFSET); - ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10"); + ASSERT(curr_digit < 10,"util.c: curr_digit < 10"); /* currsum *= 10, and check for overflow: */ tmp = mi64_mul_scalar(mi64_vec, (uint64)10, mi64_vec, *len); if(tmp != 0) { if(*len == LEN_MAX) { printf("ERROR: Mul-by-10 overflows in 
convert_base10_char_mi64: Offending input string = %s\n", char_buf); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } mi64_vec[(*len)++] = tmp; } *len += mi64_add_scalar(mi64_vec, curr_digit, mi64_vec, *len); - ASSERT(HERE, *len <= LEN_MAX,"len <= LEN_MAX"); + ASSERT(*len <= LEN_MAX,"len <= LEN_MAX"); } *len = LEN_MAX; /* Nominal length, so user knows how much memory was allocated */ return mi64_vec; @@ -8296,8 +8296,8 @@ and returns 1 if 2^(-p) == -1 (mod q) (which also means 2^p == -1), 0 otherwise. #endif uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], uint32 len, uint64*res) { - ASSERT(HERE, p != 0x0, "Null p-array pointer!"); - ASSERT(HERE, q != 0x0, "Null q-array pointer!"); + ASSERT(p != 0x0, "Null p-array pointer!"); + ASSERT(q != 0x0, "Null q-array pointer!"); uint32 pow2, FERMAT = mi64_isPow2(p,len,&pow2)<<1; // *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case #if MI64_POW_DBG uint32 dbg = FERMAT && pow2 == 256;//STREQ(&s0[convert_mi64_base10_char(s0, q, len, 0)], "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127"); @@ -8311,7 +8311,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], uint64 lead_chunk, lo64, cyout; uint32 lenP, lenQ, qbits, log2_numbits, start_index, zshift; #if MI64_POW_DBG - if(dbg) printf("mi64_twopmodq: F%u with k = %llu\n",pow2,k); + if(dbg) printf("mi64_twopmodq: F%u with k = %" PRIu64 "\n",pow2,k); #endif if(first_entry) { first_entry = FALSE; @@ -8321,8 +8321,8 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], x = (uint64 *)calloc((lenq_save ), sizeof(uint64)); lo = (uint64 *)calloc((2*lenq_save), sizeof(uint64)); } - lenP = mi64_getlen(p, len_p); ASSERT(HERE, lenP > 0, "0 exponent"); - lenQ = mi64_getlen(q, len); ASSERT(HERE, lenQ > 0, "0 modulus!"); + lenP = mi64_getlen(p, len_p); 
ASSERT(lenP > 0, "0 exponent"); + lenQ = mi64_getlen(q, len); ASSERT(lenQ > 0, "0 modulus!"); if(len_p > lenp_save) { lenp_save = len_p; pshift = (uint64 *)realloc(pshift, (len_p+1)*sizeof(uint64)); @@ -8335,21 +8335,21 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], x = (uint64 *)realloc(x , (lenQ )*sizeof(uint64)); lo = (uint64 *)realloc(lo , (2*lenQ )*sizeof(uint64)); } - ASSERT(HERE, pshift != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0, "alloc failed!"); + ASSERT(pshift != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0, "alloc failed!"); hi = lo + lenQ; // Pointer to high half of double-wide product #if MI64_POW_DBG - if(dbg) printf("mi64_twopmodq: k = %llu, len = %u, lenQ = %u\n",k,len,lenQ); + if(dbg) printf("mi64_twopmodq: k = %" PRIu64 ", len = %u, lenQ = %u\n",k,len,lenQ); #endif qbits = lenQ << 6; mi64_shrl_short(q, qhalf, 1, lenQ); /* (q >> 1) = (q-1)/2, since q odd. */ /* pshift = p + len*64 */ pshift[lenP] = mi64_add_scalar(p, lenQ*64, pshift, lenP); // April 2015: lenP ==> lenQ here! 
- ASSERT(HERE, !pshift[lenP], "pshift overflows!"); + ASSERT(!pshift[lenP], "pshift overflows!"); #if MI64_POW_DBG - if(dbg) printf("Init: k = %llu, lenP = %u, lenQ = %u\n",k,lenP,lenQ); + if(dbg) printf("Init: k = %" PRIu64 ", lenP = %u, lenQ = %u\n",k,lenP,lenQ); #endif log2_numbits = ceil(log(1.0*qbits)/log(2.0)); /* @@ -8369,7 +8369,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], */ /* Extract leftmost log2_numbits bits of pshift (if >= qbits, use the leftmost log2_numbits-1) and subtract from qbits: */ pbits = mi64_extract_lead64(pshift,len_p,&lo64); - ASSERT(HERE, pbits >= log2_numbits, "leadz64!"); + ASSERT(pbits >= log2_numbits, "leadz64!"); // if(pbits >= 64) lead_chunk = lo64>>(64-log2_numbits); // else @@ -8378,12 +8378,12 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], if(lead_chunk >= qbits) { lead_chunk >>= 1; #if MI64_POW_DBG - if(dbg) printf("lead%u = %llu\n", log2_numbits-1,lead_chunk); + if(dbg) printf("lead%u = %" PRIu64 "\n", log2_numbits-1,lead_chunk); #endif start_index = pbits-(log2_numbits-1); /* Use only the leftmost log2_numbits-1 bits */ } else { #if MI64_POW_DBG - if(dbg) printf("lead%u = %llu\n", log2_numbits ,lead_chunk); + if(dbg) printf("lead%u = %" PRIu64 "\n", log2_numbits ,lead_chunk); #endif start_index = pbits-log2_numbits; } @@ -8405,7 +8405,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], Init qinv = q. This formula returns the correct bottom 5 bits of qinv, and we double the number of correct bits on each of the subsequent iterations. 
*/ - ASSERT(HERE, (q[0] & (uint64)1) == 1, "q must be odd!"); + ASSERT((q[0] & (uint64)1) == 1, "q must be odd!"); mi64_clear(qinv, lenQ); /* Newton iteration involves repeated steps of form @@ -8429,7 +8429,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], } // Check the computed inverse: mi64_mul_vector_lo_half(q, qinv, x, lenQ); - ASSERT(HERE, mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!"); + ASSERT(mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!"); #if MI64_POW_DBG if(dbg) { printf("q = %s\n", &cbuf[convert_mi64_base10_char(cbuf, q , lenQ, 0)]); @@ -8455,14 +8455,14 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], if(dbg) printf("q*lo/2^%u = %s\n", (lenQ<<6), &cbuf[convert_mi64_base10_char(cbuf, lo, lenQ, 0)]); #endif /* hi = 0 in this instance, which simplifies things. */ - cyout = mi64_sub(q, lo, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_sub(q, lo, x, lenQ); ASSERT(cyout == 0ull, ""); if(mi64_test_bit(pshift, j)) { /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(mi64_cmpugt(x, qhalf, lenQ)) { cyout = mi64_add(x, x, x, lenQ); cyout = mi64_sub(x, q, x, lenQ); } else { - cyout = mi64_add(x, x, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_add(x, x, x, lenQ); ASSERT(cyout == 0ull, ""); } } #if MI64_POW_DBG @@ -8494,7 +8494,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], if(!mi64_cmp_eq(lo,x,lenQ)) { printf("lo = MULH_QFERM = %s\n", &cbuf[convert_mi64_base10_char(cbuf,lo, lenQ, 0)] ); printf("lo = MULH = %s\n", &cbuf[convert_mi64_base10_char(cbuf, x, lenQ, 0)] ); - printf("Mismatch! pow2 = %u, k = %llu\n",pow2,k); + printf("Mismatch! 
pow2 = %u, k = %" PRIu64 "\n",pow2,k); exit(0); } } @@ -8514,14 +8514,14 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], cyout = mi64_sub(q, lo, lo, lenQ); cyout = mi64_add(lo, hi, x, lenQ); } else { - cyout = mi64_sub(hi, lo, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_sub(hi, lo, x, lenQ); ASSERT(cyout == 0ull, ""); } if(mi64_test_bit(pshift, j)) { #if MI64_POW_DBG if(dbg) printf("2x...\n"); #endif - ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q"); + ASSERT(mi64_cmpult(x, q, lenQ), "x >= q"); /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(mi64_cmpugt(x, qhalf, lenQ)) { #if MI64_POW_DBG @@ -8530,7 +8530,7 @@ uint32 mi64_twopmodq(const uint64 p[], uint32 len_p, const uint64 k, uint64 q[], cyout = mi64_add(x, x, x, lenQ); cyout = mi64_sub(x, q, x, lenQ); } else { - cyout = mi64_add(x, x, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_add(x, x, x, lenQ); ASSERT(cyout == 0ull, ""); } } #if MI64_POW_DBG @@ -8576,7 +8576,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 static uint32 first_entry = TRUE; // Quick computation of number of uint64 needed to hold current q: - ASSERT(HERE, (k != 0) && ((k2>>1) == k), "2*k overflows!"); // Make sure 2*k does not overflow + ASSERT((k != 0) && ((k2>>1) == k), "2*k overflows!"); // Make sure 2*k does not overflow j = (p+1)&63; // p+1 mod 64, needed since q = 2*k*MMp+1 ~= k*MM(p+1) lenP = ((p+1) + 63)>>6; // #64-bit words needed lo64 = k; // Copy of k @@ -8586,7 +8586,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 lenQ = lenP; } #if MI64_POW_DBG - if(dbg) { printf("mi64_twopmodq_qmmp: k = %llu, lenP = %u, lenQ = %u\n",k,lenP,lenQ); } + if(dbg) { printf("mi64_twopmodq_qmmp: k = %" PRIu64 ", lenP = %u, lenQ = %u\n",k,lenP,lenQ); } #endif if(first_entry || (p != psave) || (lenQ != lenQ_save)) @@ -8594,15 +8594,15 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, 
uint64*res)//, uint32 first_entry = FALSE; psave = p; free((void *)pshift); - pshift = (uint64 *)calloc((lenP+1), sizeof(uint64)); ASSERT(HERE, pshift != 0x0, "calloc of pshift[] failed!"); + pshift = (uint64 *)calloc((lenP+1), sizeof(uint64)); ASSERT(pshift != 0x0, "calloc of pshift[] failed!"); pshift[0] = 1; mi64_shl(pshift, pshift, p, lenP); // 2^p mi64_sub_scalar(pshift, 1, pshift, lenP); // M(p) = 2^p-1 /* pshift = p + len*64: */ pshift[lenP] = mi64_add_scalar(pshift, lenP*64, pshift, lenP); - ASSERT(HERE, !pshift[lenP], "pshift overflows!"); + ASSERT(!pshift[lenP], "pshift overflows!"); #if MI64_POW_DBG - if(dbg) { printf("mi64_twopmodq_qmmp: Init: k = %llu, lenP = %u, lenQ = %u\n",k,lenP,lenQ); } + if(dbg) { printf("mi64_twopmodq_qmmp: Init: k = %" PRIu64 ", lenP = %u, lenQ = %u\n",k,lenP,lenQ); } #endif lenQ_save = lenQ; free((void *)q ); @@ -8616,7 +8616,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 x = (uint64 *)calloc((lenQ), sizeof(uint64)); lo = (uint64 *)calloc((2*lenQ), sizeof(uint64)); hi = lo + lenQ; /* Pointer to high half of double-wide product */ - ASSERT(HERE, q != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0 && hi != 0x0, "alloc failed!"); + ASSERT(q != 0x0 && qhalf != 0x0 && qinv != 0x0 && x != 0x0 && lo != 0x0 && hi != 0x0, "alloc failed!"); qbits = lenQ << 6; log2_numbits = ceil(log(1.0*qbits)/log(2.0)); @@ -8637,7 +8637,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 */ /* Extract leftmost log2_numbits bits of pshift (if >= qbits, use the leftmost log2_numbits-1) and subtract from qbits: */ pbits = mi64_extract_lead64(pshift,lenP,&lo64); - ASSERT(HERE, pbits >= log2_numbits, "leadz64!"); + ASSERT(pbits >= log2_numbits, "leadz64!"); // if(pbits >= 64) lead_chunk = lo64>>(64-log2_numbits); // else @@ -8646,12 +8646,12 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 if(lead_chunk >= qbits) { lead_chunk >>= 1; #if 
MI64_POW_DBG - if(dbg) { printf("lead%u = %llu\n", log2_numbits-1,lead_chunk); } + if(dbg) { printf("lead%u = %" PRIu64 "\n", log2_numbits-1,lead_chunk); } #endif start_index = pbits-(log2_numbits-1); /* Use only the leftmost log2_numbits-1 bits */ } else { #if MI64_POW_DBG - if(dbg) { printf("lead%u = %llu\n", log2_numbits ,lead_chunk); } + if(dbg) { printf("lead%u = %" PRIu64 "\n", log2_numbits ,lead_chunk); } #endif start_index = pbits-log2_numbits; } @@ -8670,8 +8670,8 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 q[0] = 1; mi64_shl(q, q, p, lenQ); mi64_sub_scalar(q, 1, q, lenQ); // M(p) = 2^p-1 cyout = mi64_mul_scalar(q, k2, q, lenQ); - ASSERT(HERE, !cyout, "2.k.M(p) overflows!"); // 2.k.M(p) - ASSERT(HERE, 0 != q[lenQ-1], "Excessive word size allocated for q!"); + ASSERT(!cyout, "2.k.M(p) overflows!"); // 2.k.M(p) + ASSERT(0 != q[lenQ-1], "Excessive word size allocated for q!"); mi64_add_scalar(q, 1ull, q, lenQ); // q = 2.k.M(p) + 1 mi64_shrl_short(q, qhalf, 1, lenQ); /* (q >> 1) = (q-1)/2, since q odd. */ #else @@ -8679,8 +8679,8 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 j = p>>6; // p/64; the set-bit in 2^p goes into the (j)th word of q[] q[j] = ( 1ull << (p-(j<<6)) ); mi64_sub_scalar(q, 1, q, lenQ); // M(p) = 2^p-1 - cyout = mi64_mul_scalar(q, k2, q, lenQ); ASSERT(HERE, !cyout, "2.k.M(p) overflows!"); // 2.k.M(p) - ASSERT(HERE, 0 != q[lenQ-1], "Excessive word size allocated for q!"); + cyout = mi64_mul_scalar(q, k2, q, lenQ); ASSERT(!cyout, "2.k.M(p) overflows!"); // 2.k.M(p) + ASSERT(0 != q[lenQ-1], "Excessive word size allocated for q!"); mi64_add_scalar(q, 1ull, q, lenQ); // q = 2.k.M(p) + 1 mi64_shrl_short_short(q,qhalf, 1, lenQ); // qhalf = (q >> 1) = (q-1)/2, since q odd. #endif @@ -8688,7 +8688,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 Find modular inverse (mod 2^qbits) of q in preparation for modular multiply. 
q must be odd for Montgomery-style modmul to work. */ - ASSERT(HERE, (q[0] & (uint64)1) == 1, "q must be odd!"); + ASSERT((q[0] & (uint64)1) == 1, "q must be odd!"); mi64_clear(qinv, lenQ); /* Newton iteration involves repeated steps of form @@ -8712,7 +8712,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 } // Check the computed inverse: mi64_mul_vector_lo_half(q, qinv, x, lenQ); - ASSERT(HERE, mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!"); + ASSERT(mi64_cmp_eq_scalar(x, 1ull, lenQ), "Bad Montmul inverse!"); #if MI64_POW_DBG if(dbg) { printf("q = %s\n", &cbuf[convert_mi64_base10_char(cbuf, q , lenQ, 0)]); @@ -8738,17 +8738,17 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 #endif /* hi = 0 in this instance, which simplifies things. */ - cyout = mi64_sub(q, lo, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_sub(q, lo, x, lenQ); ASSERT(cyout == 0ull, ""); // mi64_test_bit(pshift, j) always true for this portion of MMp powering - ASSERT(HERE, mi64_test_bit(pshift, j), "pshift bit = 0 for pre-loop step!"); - ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q"); + ASSERT(mi64_test_bit(pshift, j), "pshift bit = 0 for pre-loop step!"); + ASSERT(mi64_cmpult(x, q, lenQ), "x >= q"); /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(mi64_cmpugt(x, qhalf, lenQ)) { cyout = mi64_add(x, x, x, lenQ); cyout = mi64_sub(x, q, x, lenQ); } else { - cyout = mi64_add(x, x, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_add(x, x, x, lenQ); ASSERT(cyout == 0ull, ""); } #if MI64_POW_DBG @@ -8782,21 +8782,21 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 cyout = mi64_sub(q, lo, lo, lenQ); cyout = mi64_add(lo, hi, x, lenQ); } else { - cyout = mi64_sub(hi, lo, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_sub(hi, lo, x, lenQ); ASSERT(cyout == 0ull, ""); } #if MI64_POW_DBG if(dbg) { printf("x = 
%s\n",&cbuf[convert_mi64_base10_char(cbuf, x, lenQ, 0)]); } #endif // mi64_test_bit(pshift, j) always true for this portion of MMp powering - ASSERT(HERE, mi64_test_bit(pshift, j), "pshift bit = 0!"); + ASSERT(mi64_test_bit(pshift, j), "pshift bit = 0!"); #if MI64_POW_DBG if(!mi64_cmpult(x, q, lenQ)) { - printf("x < q test failed for k = %llu, j = %u!\n",k,j); + printf("x < q test failed for k = %" PRIu64 ", j = %u!\n",k,j); } if(dbg) { printf("2x...\n"); } #else - ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q"); + ASSERT(mi64_cmpult(x, q, lenQ), "x >= q"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ @@ -8804,7 +8804,7 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 cyout = mi64_add(x, x, x, lenQ); cyout = mi64_sub(x, q, x, lenQ); } else { - cyout = mi64_add(x, x, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_add(x, x, x, lenQ); ASSERT(cyout == 0ull, ""); } } for(; j >= 0; j--) @@ -8818,17 +8818,17 @@ uint32 mi64_twopmodq_qmmp(const uint64 p, const uint64 k, uint64*res)//, uint32 cyout = mi64_sub(q, lo, lo, lenQ); cyout = mi64_add(lo, hi, x, lenQ); } else { - cyout = mi64_sub(hi, lo, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_sub(hi, lo, x, lenQ); ASSERT(cyout == 0ull, ""); } if((pshift[0] >> j) & (uint64)1) { - ASSERT(HERE, mi64_cmpult(x, q, lenQ), "x >= q"); + ASSERT(mi64_cmpult(x, q, lenQ), "x >= q"); /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(mi64_cmpugt(x, qhalf, lenQ)) { cyout = mi64_add(x, x, x, lenQ); cyout = mi64_sub(x, q, x, lenQ); } else { - cyout = mi64_add(x, x, x, lenQ); ASSERT(HERE, cyout == 0ull, ""); + cyout = mi64_add(x, x, x, lenQ); ASSERT(cyout == 0ull, ""); } } } diff --git a/src/pairFFT_mul.c b/src/pairFFT_mul.c index 4b98958a..03890546 100755 --- a/src/pairFFT_mul.c +++ b/src/pairFFT_mul.c @@ -122,7 +122,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int static int init_sse2 = FALSE; 
int thr_id = -1; // No multithread support yet. - ASSERT(HERE, ((uint32)FFT_MUL_BASE >> 16) == 1, "FFT_MUL_BASE != 2^16"); + ASSERT(((uint32)FFT_MUL_BASE >> 16) == 1, "FFT_MUL_BASE != 2^16"); /*** Having a separate init block for the big index array allows us to init this prior @@ -131,7 +131,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int if(INIT_ARRAYS) { /* In init mode, x-input array used for temporary storage: */ - ASSERT(HERE, x != 0x0, "if INIT_ARRAYS = TRUE, x-input array must be non-null!"); + ASSERT(x != 0x0, "if INIT_ARRAYS = TRUE, x-input array must be non-null!"); /* Reset this on an INIT_ARRAYS call to ensure that the radix_set != radix_set_save code below gets executed in that case: */ @@ -145,7 +145,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int n2inv = 1.0/(N2); /* Only power-of-2 FFT lengths supported for now: */ - ASSERT(HERE, (n>>trailz32(n)) == 1,"Only power-of-2 FFT lengths supported!"); + ASSERT((n>>trailz32(n)) == 1,"Only power-of-2 FFT lengths supported!"); // Use get_fft_radices' zero-index radix set (guaranteed to be available if the FFT length is supported) // to find how many different radsets available at this length, then loop over them (including the 0-one) @@ -157,13 +157,13 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int retval = get_fft_radices(n>>10, radix_set, &NRADICES, RADIX_VEC, 10); if(retval == ERR_FFTLENGTH_ILLEGAL) { sprintf(char_str, "ERROR: length %d = %d K not available.\n", n, n>>10); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } else if(retval == ERR_RADIXSET_UNAVAILABLE) { sprintf(char_str, "ERROR: radix set %10d not available.\n",radix_set); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } else if(retval != 0) { sprintf(char_str, "ERROR: unknown return value %d from get_fft_radix; N = %d, kblocks = %u, radset = %u.\n", retval, n, kblocks, radix_set); - ASSERT(HERE, 0, char_str); + ASSERT(0, 
char_str); } // Make sure n/radix_vec0 >= 1024: if(n/RADIX_VEC[0] < 1024) @@ -171,7 +171,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int if( (RADIX_VEC[NRADICES-1] == 16) && (RADIX_VEC[0] == 8 || RADIX_VEC[0] == 16 || RADIX_VEC[0] == 32) ) break; } - ASSERT(HERE, radix_set < nradsets, "Unable to find suitable radix set!"); + ASSERT(radix_set < nradsets, "Unable to find suitable radix set!"); radix_vec0 = RADIX_VEC[0]; radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)radix_vec0)); nchunks = radix_vec0>>1; @@ -179,24 +179,24 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int /* My array padding scheme requires N/radix_vec0 to be a power of 2, and to be >= 2^DAT_BITS, where the latter parameter is set in the Mdata.h file: */ if(n%radix_vec0 != 0) { - ASSERT(HERE, 0, "ERROR: RADIX_VEC[0] does not divide N!\n"); + ASSERT(0, "ERROR: RADIX_VEC[0] does not divide N!\n"); } /* Make sure n/radix_vec0 is a power of 2: */ i = n/radix_vec0; if((i >> trailz32(i)) != 1) { - ASSERT(HERE, 0, "ERROR: n/RADIX_VEC[0] not a power of 2!\n"); + ASSERT(0, "ERROR: n/RADIX_VEC[0] not a power of 2!\n"); } /*...Set the array padding parameters - only use array padding elements for runlengths > 32K. */ if(DAT_BITS < 31) { /*...If array padding turned on, check that the blocklength divides the unpadded runlength... 
*/ - ASSERT(HERE, ((n >> DAT_BITS) << DAT_BITS) == n,"ERROR: blocklength does not divide runlength!"); + ASSERT(((n >> DAT_BITS) << DAT_BITS) == n,"ERROR: blocklength does not divide runlength!"); /* Now make sure n/RADIX_VEC[0] is sufficiently large (unless n < 2^DAT_BITS, in which case it doesn't matter): */ if(i < (1 << DAT_BITS)) { sprintf(char_str, "ERROR: n/RADIX_VEC[0] must be >= %u!\n", (1 << DAT_BITS)); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } } @@ -217,11 +217,11 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int } if(mm*RADIX_VEC[NRADICES-1] != N2) { - ASSERT(HERE, 0, "product of radices not equal to complex vector length\n"); + ASSERT(0, "product of radices not equal to complex vector length\n"); } /* index = (int *)calloc(k,sizeof(int)); */ - index_ptmp = ALLOC_INT(index_ptmp, k); if(!index_ptmp){ ASSERT(HERE, 0, "unable to allocate array INDEX in pairFFT_mul.\n"); } + index_ptmp = ALLOC_INT(index_ptmp, k); if(!index_ptmp){ ASSERT(0, "unable to allocate array INDEX in pairFFT_mul.\n"); } index = ALIGN_INT(index_ptmp); /*...Forward (DIF) FFT sincos data are in bit-reversed order. We define a separate last-pass twiddles @@ -242,7 +242,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; break; default : sprintf(char_str, "radix[0] = %d not available.\n",RADIX_VEC[i]); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } for(i=1; i < NRADICES; i++) @@ -279,7 +279,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; radix_prim[l++] = 2; break; default : sprintf(char_str, "radix %d not available. 
Halting...\n",RADIX_VEC[i]); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } } nradices_prim = l; @@ -297,7 +297,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int */ default : sprintf(char_str, "ERROR: radix %d not available for _pairFFT dyadic-mul step.\n",RADIX_VEC[NRADICES-1]); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } return; @@ -307,15 +307,15 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int /* If FORWARD_FFT_ONLY = TRUE, at least the X-ptr should be valid: */ n_inputs = 1; if((uint32)FORWARD_FFT_ONLY > 2) { - ASSERT(HERE, 0, "FORWARD_FFT_ONLY not a any-nonzero-denotes-TRUE param: legal TRUE-values are 1 and 2!"); + ASSERT(0, "FORWARD_FFT_ONLY not a any-nonzero-denotes-TRUE param: legal TRUE-values are 1 and 2!"); } else if(FORWARD_FFT_ONLY == 1) { - ASSERT(HERE, x != 0x0 && z == 0x0, "FORWARD_FFT_ONLY requires X-input nonzero and Z-input null!"); + ASSERT(x != 0x0 && z == 0x0, "FORWARD_FFT_ONLY requires X-input nonzero and Z-input null!"); /* One or two inputs to be processed? 
*/ ivec[0] = x; ivec[1] = y; n_inputs += (y != 0x0); } else { // FORWARD_FFT_ONLY = 0 and 2 behave similarly - ASSERT(HERE, x != 0x0 && y != 0x0, "FORWARD_FFT_ONLY = FALSE requires Non-null X,Y-inputs!"); + ASSERT(x != 0x0 && y != 0x0, "FORWARD_FFT_ONLY = FALSE requires Non-null X,Y-inputs!"); /* One input to be processed: */ ivec[0] = x; ab_mul = y; cd_mul = z; @@ -343,14 +343,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int for(i = 0; i < NRADICES; i++) { if(RADIX_VEC[i] == 0) { sprintf(cbuf, "RADIX_VEC[i = %d] zero, for i < [NRADICES = %d]!",i,NRADICES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } radix_set_save[i] = RADIX_VEC[i]; } for(i = NRADICES; i < 10; i++) { if(RADIX_VEC[i] != 0) { sprintf(cbuf, "RADIX_VEC[i = %d] nonzero, for i >= [NRADICES = %d]!",i,NRADICES); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } radix_set_save[i] = 0; } @@ -360,7 +360,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int if(n%radix_vec0 != 0) { sprintf(cbuf ,"RADIX_VEC[0] does not divide N!\n"); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* Make sure n/RADIX_VEC[0] is a power of 2: */ @@ -368,7 +368,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int if((i >> trailz32(i)) != 1) { sprintf(cbuf ,"n/RADIX_VEC[0] not a power of 2!\n"); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } if(DAT_BITS < 31) { @@ -376,14 +376,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int if(i < (1 << DAT_BITS)) { sprintf(cbuf ,"vn/RADIX_VEC[0] must be >= %u!\n", (1 << DAT_BITS)); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* We also have a lower limit on 2^DAT_BITS set by the pairFFT_mul routine: */ if((1 << DAT_BITS) < 2*RADIX_VEC[NRADICES-1]) { sprintf(cbuf ,"final FFT radix may not exceed = %u!\n", (1 << (DAT_BITS-1))); fprintf(stderr,"%s", cbuf); - ASSERT(HERE, 0,cbuf); + 
ASSERT(0,cbuf); } } @@ -415,14 +415,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int NRT_BITS = (uint32)(log(sqrt(1.0*n))/log(2.0) + 0.5); NRT = 1 << NRT_BITS; NRTM1 = NRT - 1; if(n%NRT) { sprintf(cbuf,"ERROR: NRT does not divide N!\n"); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /*...The rt0 array stores the (0:NRT-1)th powers of the [N2]th root of unity (i.e. will be accessed using the lower lg(NRT) bits of the integer sincos index): */ rt0_ptmp = ALLOC_COMPLEX(rt0_ptmp, NRT); - if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!rt0_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt0 = ALIGN_COMPLEX(rt0_ptmp); qt = i64_to_q((int64)N2); @@ -443,7 +443,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int (and will be accessed using the upper bits, , of the integer sincos index): */ rt1_ptmp = ALLOC_COMPLEX(rt1_ptmp, n/(2*NRT)); - if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!rt1_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array RT1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } rt1 = ALIGN_COMPLEX(rt1_ptmp); qn = i64_to_q((int64)NRT); @@ -468,7 +468,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int /* 8/23/2004: Need to allocate an extra element here to account for the padding element that gets inserted when radix_vec0 is odd: */ block_index = (int *)calloc((radix_vec0+1),sizeof(int)); - if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!block_index){ sprintf(cbuf,"ERROR: unable to allocate array BLOCK_INDEX in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Examples - We only 
allow powers of 2 here, for the more general case cf. mers_mod_square.c: @@ -522,7 +522,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int // Do two loop executions: for(j = 0; j < 2; j++) { - if(!(l >= 0 && l < radix_vec0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!(l >= 0 && l < radix_vec0)) { sprintf(cbuf,"ERROR 10 in %s.c\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } block_index[ii] = l; //fprintf(stderr,"%3d %3d\n",ii,l); ii++; // every time we execute this innermost loop (which corresponds to one // block of FFT data being processed), increment the linear array index @@ -543,14 +543,14 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int } /* End of Main loop */ /* arrays storing the index values needed for the parallel-block wrapper/square scheme: */ - if( !(ws_i = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_j1 = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_j2 = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_j2_start = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_k = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_m = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( 
!(ws_blocklen = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - if( !(ws_blocklen_sum = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if( !(ws_i = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_I in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_j1 = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_j2 = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_j2_start = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_J2_START in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_k = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_K in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_m = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_M in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_blocklen = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + if( !(ws_blocklen_sum = (int *)calloc(radix_vec0,sizeof(int))) ) { sprintf(cbuf,"ERROR: unable to allocate array WS_BLOCKLEN_SUM in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /*...Final DIF pass, wrapper/squaring and initial DIT pass are all done in-place. 
This combines data from both the l1 and l2-block, except in the case ii = 0 @@ -583,7 +583,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int */ default : sprintf(cbuf,"ERROR: Final radix %d not available for %s. Halting...\n",RADIX_VEC[NRADICES-1],func); - fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } } } @@ -614,7 +614,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int radix32_dif_pass1(a,n); break; default : sprintf(cbuf,"ERROR: radix %d not available for dif_pass1. Halting...\n",radix_vec0); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } } /* Break the remaining portion of the FFT into radix0 blocks, and in each pass of the resulting loop @@ -648,10 +648,10 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int case 32 : ierr = radix32_ditN_cy_dif1 (a,n, 0, 0,0x0,0x0,0x0,0x0,0x0,0x0, 0x0, 0,&fracmax,0); break; default : - sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix_vec0); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); + sprintf(cbuf,"ERROR: radix %d not available for ditN_cy_dif1. Halting...\n",radix_vec0); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } /* Nonzero remaining carries are instantly fatal: */ - ASSERT(HERE, ierr == 0, "pairFFT_mul: Fatal: carry routine return error!"); + ASSERT(ierr == 0, "pairFFT_mul: Fatal: carry routine return error!"); /*...Now do the fractional error check. Any fractional part > 0.40625 generates a warning... 
*/ // Dec 2014: Bump threshold up from ( >= 0.4 ) to ( > 0.40625 ): @@ -678,7 +678,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int radix32_dit_pass1(a,n); break; default : sprintf(char_str, "radix %d not available for final IFFT pass!\n",radix_vec0); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } /*...And re-NINT the 'undo pass' data, which may differ from pure-int by some tiny amount: */ @@ -703,7 +703,7 @@ void pairFFT_mul(double x[], double y[], double z[], int n, int INIT_ARRAYS, int { fprintf(stderr,"%s: max_fp > 0.01! Value = %20.10f\n",func,max_fp); fprintf(stderr,"Check your build for inadvertent mixing of SSE2 and non-SSE2-enabled files!\n"); - ASSERT(HERE, max_fp < 0.01,"max_fp < 0.01"); + ASSERT(max_fp < 0.01,"max_fp < 0.01"); } // Restore input value of MODULUS_TYPE: @@ -754,7 +754,7 @@ if(FORWARD_FFT_ONLY != 2) // Cf. comments in pairFFT_mul about this radix32_dif_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break; default : sprintf(cbuf,"pairFFT_mul_process_chunk: ERROR: radix %d not available for dif_pass. Halting...\n",RADIX_VEC[i]); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } k += mm*radix_vec0; @@ -783,7 +783,7 @@ if(FORWARD_FFT_ONLY != 2) // Cf. comments in pairFFT_mul about this */ default : sprintf(char_str, "pairFFT_mul_process_chunk: ERROR: radix %d not available for dyadic mul step.\n",RADIX_VEC[NRADICES-1]); - ASSERT(HERE, 0, char_str); + ASSERT(0, char_str); } } /* In forward-FFT-only mode, do none of the IFFT passes: */ @@ -797,7 +797,7 @@ if(FORWARD_FFT_ONLY != 2) // Cf. 
comments in pairFFT_mul about this { /* Get block index of the chunk of contiguous data to be processed: */ l = block_index[ii + j]; - ASSERT(HERE, l >= 0,"pair_FFTmul_process_chunk: l >= 0"); + ASSERT(l >= 0,"pair_FFTmul_process_chunk: l >= 0"); /* Quick-n-dirty way of generating the correct starting values of k, mm and incr - simply use the skeleton of the forward (DIF) loop, sans the i = NRADICES-2 pass @@ -837,7 +837,7 @@ if(FORWARD_FFT_ONLY != 2) // Cf. comments in pairFFT_mul about this radix32_dit_pass(&a[jstart],n,rt0,rt1,&index[k+koffset],mm,incr,init_sse2,thr_id); break; default : sprintf(cbuf,"pairFFT_mul_process_chunk: ERROR: radix %d not available for dit_pass. Halting...\n",RADIX_VEC[i]); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } } /* end i-loop */ } /* end j-loop */ diff --git a/src/pm1.c b/src/pm1.c index 017aee59..8fcf1d0b 100755 --- a/src/pm1.c +++ b/src/pm1.c @@ -43,7 +43,7 @@ Then to run, e.g. uint32 PM1_S2_NBUF = 0; // # of floating-double residue-length memblocks available for Stage 2 uint32 B1 = 0; uint64 B2 = 0ull, B2_start = 0ull; - char cbuf[STR_MAX_LEN]; + char cbuf[STR_MAX_LEN*2]; uint32 SYSTEM_RAM, MAX_RAM_USE; // Total usable main memory size, and max. amount of that to use per instance, in MB double MME; #else @@ -202,9 +202,9 @@ uint32 pm1_set_bounds(const uint64 p, const uint32 n, const uint32 tf_bits, cons // Force B1 >= 10^4 to avoid possible large-buffer-count underflow of qlo in stage 2. // Conservatively use (#bits in Stage 1 prime-powers product ~= 1.5*B1), must fit into a uint32, thus B1_max = 2^33/3 = 2863311530: i64 = p>>7; - ASSERT(HERE, i64 <= 2863311530ull, "Stage 1 prime-powers product must fit into a uint32; default B1 for your exponent is too large!"); + ASSERT(i64 <= 2863311530ull, "Stage 1 prime-powers product must fit into a uint32; default B1 for your exponent is too large!"); B1 = MAX((uint32)i64,10000); // #bits in Stage 1 prime-powers product ~= 1.4*B1, so e.g. 
B1 = p/128 gives a ~= 1.1*p/100 bits - B1 = (B1 + 99999)*inv100k; B1 *= 100000; ASSERT(HERE, B1 >= 100000, "B1 unacceptably small!"); // Round up to nearest 100k: + B1 = (B1 + 99999)*inv100k; B1 *= 100000; ASSERT(B1 >= 100000, "B1 unacceptably small!"); // Round up to nearest 100k: if(PM1_S2_NBUF < 24) { sprintf(cbuf,"pm1_set_bounds: Insufficient free memory for Stage 2 ... will run only Stage 1.\n"); mlucas_fprint(cbuf,pm1_standlone+1); @@ -259,15 +259,15 @@ uint32 pm1_set_bounds(const uint64 p, const uint32 n, const uint32 tf_bits, cons pm1_bigstep_size(&PM1_S2_NBUF, &bigstep, &stage2_mem_multiple,psmall); if(bigstep != 210 && bigstep != 330 && bigstep != 420 && bigstep != 660 && bigstep != 840) { sprintf(cbuf,"%u is unsupported value of bigstep!",bigstep); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } double f2 = 30.0/(1 - 0.93*log10(log10(0.37037037037037037037*(double)PM1_S2_NBUF))); B2 = (uint64)(f2*(double)B1); // Round to nearest 1m: - B2 = (B2 + 999999)*inv1m; B2 *= 1000000; ASSERT(HERE, B2 >= 1000000, "B2 unacceptably small!"); + B2 = (B2 + 999999)*inv1m; B2 *= 1000000; ASSERT(B2 >= 1000000, "B2 unacceptably small!"); } pm1_check_bounds(); // This sanity-checks the bounds and sets B2_start = B1 if unset. 
- sprintf(cbuf,"Setting default p-1 stage bounds b1 = %u, b2_start = %llu, b2 = %llu.\n",B1,B2_start,B2); + sprintf(cbuf,"Setting default p-1 stage bounds b1 = %u, b2_start = %" PRIu64 ", b2 = %" PRIu64 ".\n",B1,B2_start,B2); mlucas_fprint(cbuf,pm1_standlone+1); return 1; } @@ -283,17 +283,17 @@ uint32 pm1_check_bounds() { if(B1 < 10000) { sprintf(cbuf,"The minimum P-1 Stage 1 bound = 10000; resetting to that.\n"); mlucas_fprint(cbuf,pm1_standlone+1); B1 = 10000; } #endif if(B2_start) { - if(B2_start > B2) { sprintf(cbuf,"P-1 Stage 2 starting bound [= %llu] must be less than or equal to Stage 2 bound [= %llu].\n",B2_start,B2); break; } - if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %llu] must be greater than or equal to that of Stage 1 [= %u].\n",B2,B1); break; } + if(B2_start > B2) { sprintf(cbuf,"P-1 Stage 2 starting bound [= %" PRIu64 "] must be less than or equal to Stage 2 bound [= %" PRIu64 "].\n",B2_start,B2); break; } + if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %" PRIu64 "] must be greater than or equal to that of Stage 1 [= %u].\n",B2,B1); break; } } else if(B2) { // Stage 2 takes off where Stage 1 left off - if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %llu] set nonzero but < Stage 1 bound [= %u] ... no Stage 2 will be run.\n",B2,B1); } + if(B1 > B2) { sprintf(cbuf,"P-1 Stage 2 bound [= %" PRIu64 "] set nonzero but < Stage 1 bound [= %u] ... no Stage 2 will be run.\n",B2,B1); } B2_start = B1; } else { // No Stage 2 - Can set both of these to 0 or B1 in this case B2_start = B2 = (uint64)0; } return 1; // B1 and B2 legal. } - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); return 0; // Bzzt! 
} @@ -333,7 +333,7 @@ global would be needed to store that - and remultiply by the appropriate one for */ uint32 compute_pm1_s1_product(const uint64 p) { const double A = 1.1; - ASSERT(HERE, B1 > 0, "Call to compute_pm1_s1_product needs Stage 1 bound global B1 to be set!"); + ASSERT(B1 > 0, "Call to compute_pm1_s1_product needs Stage 1 bound global B1 to be set!"); double ln = log(B1), lg = ln*ILG2; uint32 i,len = 0,nmul,nbits,ebits = (uint32)((lg-A)*B1/(ln-A)); uint64 iseed,maxmult; @@ -344,7 +344,7 @@ uint32 compute_pm1_s1_product(const uint64 p) { PM1_S1_PRODUCT = ALLOC_UINT64(PM1_S1_PRODUCT, s1p_alloc); if(!PM1_S1_PRODUCT ){ sprintf(cbuf, "ERROR: unable to allocate array PM1_S1_PRODUCT with %u linbs in main.\n",s1p_alloc); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } // (E.g. on restart) First see if a savefile holding the precomputed/bit-reversed product for this p and B1 exists: @@ -359,22 +359,22 @@ uint32 compute_pm1_s1_product(const uint64 p) { #endif // For M(p) want to seed the S1 prime-powers product with 2*p; for F(m) we want seed = 2^(m+2). 
Since in the latter // case our input p contains 2^m, can handle both cases via iseed = 4*p, giving an extra *2 in the Mersenne case: - iseed = p<<2; ASSERT(HERE, (iseed>>2) == p,"Binary exponent overflows (uint64)4*p in compute_pm1_s1_product!"); + iseed = p<<2; ASSERT((iseed>>2) == p,"Binary exponent overflows (uint64)4*p in compute_pm1_s1_product!"); len = pm1_s1_ppow_prod(iseed, B1, PM1_S1_PRODUCT, &nmul, &maxmult); PM1_S1_PROD_B1 = B1; nbits = (len<<6)-mi64_leadz(PM1_S1_PRODUCT,len); if(len > s1p_alloc) { sprintf(cbuf,"Size of S1 prime-powers product exceeds alloc of PM1_S1_PRODUCT[]!"); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } /* fprintf(stderr,"Product of Stage 1 prime powers used %u mi64_mul_scalar() calls; max-multiplier %u bits\n",nmul, 64-leadz64(maxmult)); fprintf(stderr,"Limbs of PM1_S1_PRODUCT, low to high:\n"); for(i = 0; i < len; i+=8) { - fprintf(stderr,"%llx,%llx,%llx,%llx,%llx,%llx,%llx,%llx\n",PM1_S1_PRODUCT[i],PM1_S1_PRODUCT[i+1],PM1_S1_PRODUCT[i+2],PM1_S1_PRODUCT[i+3],PM1_S1_PRODUCT[i+4],PM1_S1_PRODUCT[i+5],PM1_S1_PRODUCT[i+6],PM1_S1_PRODUCT[i+7]); + fprintf(stderr,"%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 ",%" PRIx64 "\n",PM1_S1_PRODUCT[i],PM1_S1_PRODUCT[i+1],PM1_S1_PRODUCT[i+2],PM1_S1_PRODUCT[i+3],PM1_S1_PRODUCT[i+4],PM1_S1_PRODUCT[i+5],PM1_S1_PRODUCT[i+6],PM1_S1_PRODUCT[i+7]); } exit(0); */ - // fprintf(stderr,"PM1_S1_PRODUCT limbs[%u,%u,...,1,0] = %016llX,%016llX,...,%016llX,%016llX\n",len-1,len-2,PM1_S1_PRODUCT[len-1],PM1_S1_PRODUCT[len-2],PM1_S1_PRODUCT[1],PM1_S1_PRODUCT[0]); + // fprintf(stderr,"PM1_S1_PRODUCT limbs[%u,%u,...,1,0] = %016" PRIX64 ",%016" PRIX64 ",...,%016" PRIX64 ",%016" PRIX64 "\n",len-1,len-2,PM1_S1_PRODUCT[len-1],PM1_S1_PRODUCT[len-2],PM1_S1_PRODUCT[1],PM1_S1_PRODUCT[0]); // Ignore the #iters != 0 user needed to set to invoke selfTest mode, replace with nbits in S1 prime-powers product: 
PM1_S1_PROD_BITS = nbits-1; // Leftmost bit accounted for by setting initial seed in the LR-modular binary powering // Bit-reverse s1 product, leaving leftmost 1-bit off. REMEMBER, this puts the 0-bits corresponding to the @@ -393,7 +393,7 @@ uint32 compute_pm1_s1_product(const uint64 p) { #ifndef PM1_STANDALONE // Write result to savefile: if(!write_pm1_s1_prod(savefile, p, PM1_S1_PROD_BITS, PM1_S1_PRODUCT, PM1_S1_PROD_RES64)) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"WARN: Unable to write precomputed/bit-reversed Stage 1 prime-powers product to savefile %s.\n",savefile); + snprintf(cbuf,STR_MAX_LEN*2,"WARN: Unable to write precomputed/bit-reversed Stage 1 prime-powers product to savefile %s.\n",savefile); mlucas_fprint(cbuf,pm1_standlone+1); } } // endif(read_pm1_s1_prod) @@ -401,7 +401,7 @@ uint32 compute_pm1_s1_product(const uint64 p) { sprintf(cbuf,"Product of Stage 1 prime powers with b1 = %u is %u bits (%u limbs), vs estimated %u. Setting PRP_BASE = 3.\n",B1,PM1_S1_PROD_BITS+1,len,ebits); mlucas_fprint(cbuf,pm1_standlone+1); PRP_BASE = 3; - sprintf(cbuf,"BRed (PM1_S1_PRODUCT sans leading bit) has %u limbs, Res64 = %llu\n",len,PM1_S1_PROD_RES64); + sprintf(cbuf,"BRed (PM1_S1_PRODUCT sans leading bit) has %u limbs, Res64 = %" PRIu64 "\n",len,PM1_S1_PROD_RES64); mlucas_fprint(cbuf,pm1_standlone+0); return len; // return actual #limbs of product, not initial overestimate } @@ -412,8 +412,8 @@ uint32 pm1_s1_ppow_prod(const uint64 iseed, const uint32 b1, uint64 accum[], uin uint32 p = 2,i,j,len,maxbits = 64-leadz64(b1); uint32 loop = 64/maxbits; // Number of prime-powers we can accumulate inside inner loop while remaining < 2^64 uint64 tmp,prod,mult,cy = 0ull; - ASSERT(HERE, accum != 0x0, "Null accum[] pointer in s1_ppow_prod()"); - ASSERT(HERE, accum != 0x0, "Zero initial seed in s1_ppow_prod()"); + ASSERT(accum != 0x0, "Null accum[] pointer in s1_ppow_prod()"); + ASSERT(accum != 0x0, "Zero initial seed in s1_ppow_prod()"); accum[0] = iseed; len = 1; *nmul = 0; 
*maxmult = 0ull; // Debug-only - allows testing of S1 on known-factor case without actually running S2: #if 0 @@ -422,10 +422,10 @@ uint32 pm1_s1_ppow_prod(const uint64 iseed, const uint32 b1, uint64 accum[], uin mult = 140091319777ull; cy = mi64_mul_scalar(accum, mult, accum, len); ++*nmul; accum[len] = cy; len += (cy != 0ull); - fprintf(stderr,"Pre-loop accumulator = %llu + 2^64*%llu",accum[0],accum[1]); + fprintf(stderr,"Pre-loop accumulator = %" PRIu64 " + 2^64*%" PRIu64,accum[0],accum[1]); } #endif -// fprintf(stderr,"Stage 1 exponent = %llu.",accum[0]); +// fprintf(stderr,"Stage 1 exponent = %" PRIu64 ".",accum[0]); while(p < b1) { mult = 1ull; for(i = 0; i < loop; i++) { @@ -457,8 +457,8 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin uint8 c; uint32 i,j,b1 = 0,nbytes,nlimbs; uint64 itmp64 = 0ull,isum64 = 0ull; - ASSERT(HERE, arr != 0x0, "Null arr pointer!"); - ASSERT(HERE, strlen(fname) != 0, "Empty filename!"); + ASSERT(arr != 0x0, "Null arr pointer!"); + ASSERT(strlen(fname) != 0, "Empty filename!"); #ifdef PM1_STANDALONE FILE*fptr = 0x0; goto PM1_S1P_READ_RETURN; @@ -507,7 +507,7 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin } for(i = 0; i < nlimbs; i++) { itmp64 += arr[i]; } if(itmp64 != isum64) { - sprintf(cbuf, "INFO: %s: Computed checksum[%llX] mismatches one[%llX] appended to savefile data.\n",func,itmp64,isum64); + sprintf(cbuf, "INFO: %s: Computed checksum[%" PRIX64 "] mismatches one[%" PRIX64 "] appended to savefile data.\n",func,itmp64,isum64); *sum64 = 0ull; goto PM1_S1P_READ_RETURN; } else { @@ -529,13 +529,13 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin uint8 c; uint32 i,j,b1 = 0,nbytes,nlimbs; uint64 itmp64 = 0ull; - ASSERT(HERE, arr != 0x0, "Null arr pointer!"); - ASSERT(HERE, strlen(fname) != 0, "Empty filename!"); + ASSERT(arr != 0x0, "Null arr pointer!"); + ASSERT(strlen(fname) != 0, "Empty filename!"); FILE*fptr = 
mlucas_fopen(fname, "wb"); if(!fptr) { sprintf(cbuf,"ERROR: Unable to open precomputed p-1 stage 1 primes-product file %s for writing.\n",fname); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0, cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0, cbuf); } fprintf(stderr,"INFO: Opened precomputed p-1 stage 1 primes-product file %s for writing...\n",fname); @@ -561,7 +561,7 @@ int read_pm1_s1_prod(const char*fname, uint64 p, uint32*nbits, uint64 arr[], uin // Write 8 bytes of simple (sum of limbs, mod 2^64) checksum, after comparing arglist version to one computed from actual data: for(i = 0; i < nlimbs; i++) { itmp64 += arr[i]; } if(itmp64 != sum64) { - sprintf(cbuf, "INFO: %s: Computed checksum[%llX] mismatches one[%llX] in arglist.\n",func,itmp64,sum64); + sprintf(cbuf, "INFO: %s: Computed checksum[%" PRIX64 "] mismatches one[%" PRIX64 "] in arglist.\n",func,itmp64,sum64); goto PM1_S1P_WRITE_RETURN; } for(j = 0; j < 64; j += 8) { @@ -769,7 +769,7 @@ void pm1_bigstep_size(uint32*nbuf, uint32*bigstep, uint32*m, uint32 psmall) else if(psmall == 11) lut = lut_psmall11; else - ASSERT(HERE, 0, "pm1_bigstep_size: Bad input value of relocation-prime!"); + ASSERT(0, "pm1_bigstep_size: Bad input value of relocation-prime!"); // High-RAM case - For given D and associated num_b, M = floor(nbuf/num_b), where num_b = 24|40|48|80|96 // for D = 210|330|420|660|840. Only need to special-case psmall = 7 here, all others use D = 840: if(*nbuf >= 10000) { @@ -790,14 +790,14 @@ void pm1_bigstep_size(uint32*nbuf, uint32*bigstep, uint32*m, uint32 psmall) if(lut[i] > *nbuf) break; } if(!i) - ASSERT(HERE, 0, "P-1 stage 2 with relocation prime psmall = 7|11 needs at least 40|24 buffers of available RAM, respectively!"); + ASSERT(0, "P-1 stage 2 with relocation prime psmall = 7|11 needs at least 40|24 buffers of available RAM, respectively!"); if(psmall) { sprintf(cbuf,"Previous Stage 2 work used relocation-prime %u ... 
enforcing compatibility with this: bigstep must be a multiple of %u.\n",psmall,18-psmall); mlucas_fprint(cbuf,pm1_standlone+1); // Here's why we don't declare psmall const in the arglist - it stores the smallest prime which does // not divide the bigstep value, in order to check divisibility replace it by its complement here: psmall = 18-psmall; - ASSERT(HERE, (lut[i-1]%psmall == 0), "P-1 stage 2 needs at least 24 buffers of available RAM!"); + ASSERT((lut[i-1]%psmall == 0), "P-1 stage 2 needs at least 24 buffers of available RAM!"); /* First-go-round of this used just a single unified lut[] array and worked backward to the largest nbuf whoe D is compatible: for( ; ; i -= 2) { if(lut[i-1]%psmall == 0) break; @@ -839,7 +839,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow, #ifdef PM1_DEBUG uint32 j; double dsum; #endif - ASSERT(HERE,a && b && n && func_mod_square,"Null input pointer or vector length in pm1.c::modpow!"); + ASSERT(a && b && n && func_mod_square,"Null input pointer or vector length in pm1.c::modpow!"); // pow = 0: , b[1:n-1] = 0: if(!pow) { b[0] = 1.0; @@ -853,7 +853,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow, } // Init b = fwdFFT(a); only need this if power != 2^k, in which case we only need autosquarings: #ifdef PM1_DEBUG - fprintf(stderr,"MODPOW: pow = %llu\n",pow); + fprintf(stderr,"MODPOW: pow = %" PRIu64 "\n",pow); #endif if(!isPow2_64(pow)) { memcpy(b,a,nbytes); // b = a vvvv + 4 to effect "Do in-place forward FFT only; low bit = 0 here implies pure-int input" @@ -864,7 +864,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow, #ifdef PM1_DEBUG dsum = 0; for(j = 0; j < npad; j++) { dsum += fabs(b[j]); }; fprintf(stderr,"b = fwdFFT(a) gives b[0] = %20.8f, b[1] = %20.8f, L1(b) = %20.8f\n",b[0],b[1],dsum/n); MME = 0; #endif - } ASSERT(HERE, nerr == 0, "func_mod_square returns error!"); + } ASSERT(nerr == 0, "func_mod_square returns error!"); // Use LR binary modpow algorithm, 
though it's no faster in this general case than RL: uint32 len = nbits64(pow); pow = reverse64(pow,len)>>1; // Leftmost bit of input power accounted for by implied (result = a[]) init; @@ -904,7 +904,7 @@ int modpow(double a[], double b[], uint32 input_is_int, uint64 pow, } } // For initial release, no error handling - note we do have ROE handling in the main Stage 2 loop: - if(nerr != 0) { sprintf(cbuf,"modpow hit one or more errors! Aborting."); ASSERT(HERE,0,cbuf); } + if(nerr != 0) { sprintf(cbuf,"modpow hit one or more errors! Aborting."); ASSERT(0,cbuf); } // Result returned in a[]: return (nerr != 0); } @@ -1038,16 +1038,16 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, fprintf(stderr, "*** ERROR: Unrecognized flag %s.\n", stFlag); return 1; } } - ASSERT(HERE, bigstep && B1 && B2 && m, "All 4 args bigstep,b1,b2,m must be set > 0!"); + ASSERT(bigstep && B1 && B2 && m, "All 4 args bigstep,b1,b2,m must be set > 0!"); B2_start = (uint64)B1; #else // Check function pointer to [mers|fermat]_mod_square based on modulus type: if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) - ASSERT(HERE, func_mod_square == mers_mod_square , "Mod-square function pointer incorrectly set in pm1_stage2!"); + ASSERT(func_mod_square == mers_mod_square , "Mod-square function pointer incorrectly set in pm1_stage2!"); else if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) - ASSERT(HERE, func_mod_square == fermat_mod_square, "Mod-square function pointer incorrectly set in pm1_stage2!"); + ASSERT(func_mod_square == fermat_mod_square, "Mod-square function pointer incorrectly set in pm1_stage2!"); else - ASSERT(HERE,0,"Modulus type not set in pm1_stage2!"); + ASSERT(0,"Modulus type not set in pm1_stage2!"); #endif #ifndef PM1_STANDALONE @@ -1087,7 +1087,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, } else { // In the case of a standalone S2 interval (B2_small > B1), set psmall = 0 and reloc_start = UINT64_MAX: psmall = 0; reloc_start = 
-1ull; } - sprintf(cbuf,"Using B2_start = %llu, B2 = %llu, Bigstep = %u, M = %u\n",B2_start,B2,bigstep,m); + sprintf(cbuf,"Using B2_start = %" PRIu64 ", B2 = %" PRIu64 ", Bigstep = %u, M = %u\n",B2_start,B2,bigstep,m); mlucas_fprint(cbuf,pm1_standlone+1); uint32 reloc_on = FALSE; // Gets switched to TRUE (= start using semiprimes which are multiples of psmall) when q > reloc_start @@ -1099,8 +1099,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, tmp = (q0/(uint64)bigstep - 1)<<1; // tmp holds m_max as a uint64 // For this condition to be hit implies q0 quite small, but makes sure resulting m is 32-bit anyway: if((uint64)m < tmp) { - sprintf(cbuf, "Nonsensical value of M_max = %llu in qlo-underflow check ... aborting.",tmp); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + sprintf(cbuf, "Nonsensical value of M_max = %" PRIu64 " in qlo-underflow check ... aborting.",tmp); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } m = tmp-1; PM1_S2_NBUF = m*num_b; // Don't use PM1_S2_NBUF per se in code below, but reset for consistency @@ -1111,7 +1111,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, // May 2021: Added support for M even: m_is_odd = IS_ODD(m); m_is_even = !m_is_odd; - ASSERT(HERE, RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n"); // Need BASE_MULTIPLIER_BITS array = 0 for modmuls below! + ASSERT(RES_SHIFT == 0ull, "Shifted residues unsupported for p-1!\n"); // Need BASE_MULTIPLIER_BITS array = 0 for modmuls below! 
// Alloc the needed memory: #ifndef PM1_STANDALONE nlimb = (p+63+(MODULUS_TYPE == MODULUS_TYPE_FERMAT))>>6; // # of 64-bit limbs in p-bit vector, alloc 2 of these for debug: @@ -1132,20 +1132,20 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, j = 0; if(nalloc & 7) j = 8 - (nalloc & 7); - nalloc += j; ASSERT(HERE, (nalloc & 7) == 0,"nalloc must be a multiple of 8!"); // Ensure 64-byte alignment of a[] + nalloc += j; ASSERT((nalloc & 7) == 0,"nalloc must be a multiple of 8!"); // Ensure 64-byte alignment of a[] // double*a holds ptr to 1 scratch vector, double**buf holds ptrs to num_b*m double-vecs of same length npad: a_ptmp = ALLOC_DOUBLE(a_ptmp, nalloc); if(!a_ptmp){ sprintf(cbuf, "ERROR: unable to allocate the needed %u buffers of p-1 Stage 2 storage.\n",num_b*m + use_pp1); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } - a = ALIGN_DOUBLE(a_ptmp); ASSERT(HERE, ((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!"); + a = ALIGN_DOUBLE(a_ptmp); ASSERT(((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!"); buf = (double **)calloc(num_b*m,sizeof(double *)); // ...and num_b*m "buffers" for precomputed bigstep-coprime odd-square powers of the stage 1 residue: for(i = 0; i < num_b*m; i++) { buf[i] = a + i*npad; -// fprintf(stderr,"buf[%3d] = 0x%llX\n",i,(uint64)buf[i]); - ASSERT(HERE, ((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!"); +// fprintf(stderr,"buf[%3d] = %#" PRIX64 "\n",i,(uint64)buf[i]); + ASSERT(((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!"); } // Still do fwdFFT(1) as init-FFT step in non-(p+1) build, but use uppermost buf[] entry to hold as throwaway result: vone = a + (i - 1 + use_pp1)*npad; @@ -1168,8 +1168,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, 
PTHREAD_CREATE_JOINABLE); const int nbytes_simd_align = (RE_IM_STRIDE*8) - 1; // And per-thread data chunk addresses with this to check SIMD alignment - ASSERT(HERE, ((intptr_t)mult[0] & nbytes_simd_align) == 0x0,"mult[0] not aligned on 64-byte boundary!"); - ASSERT(HERE, ((intptr_t)buf [0] & nbytes_simd_align) == 0x0,"buf [0] not aligned on 64-byte boundary!"); // Since npad a multiple of RE_IM_STRIDE, only need to check buf[0] alignment + ASSERT(((intptr_t)mult[0] & nbytes_simd_align) == 0x0,"mult[0] not aligned on 64-byte boundary!"); + ASSERT(((intptr_t)buf [0] & nbytes_simd_align) == 0x0,"buf [0] not aligned on 64-byte boundary!"); // Since npad a multiple of RE_IM_STRIDE, only need to check buf[0] alignment j = npad / NTHREADS; // j = #doubles in each thread-processed chunk /* Fiddle up-or-downward to make it a multiple of RE_IM_STRIDE; say this == 8. Since j == (npad/NTHREADS) - [0 or 1] due to truncation-on-integer-div, if jmod := (j % RE_IM_STRIDE) < RE_IM_STRIDE/2, subtract jmod from j, otherwise @@ -1190,15 +1190,15 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, tdat[i].n = j; // Chunksize } tdat[NTHREADS-1].n = npad - (NTHREADS-1)*j; // Fiddle the last thread's chunksize so the sum == npad - ASSERT(HERE, 0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, NTHREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(NTHREADS, MAX_THREADS, NTHREADS, &thread_control)), "threadpool_init failed!"); printf("%s: Init threadpool of %d threads\n",func,NTHREADS); #endif // PM1_STANDALONE? #endif // MULTITHREAD? 
// Integer arrays: - b = malloc(m*(bigstep>>1)*sizeof(uint32)); ASSERT(HERE, b != NULL, "B[]-array alloc failed!"); + b = malloc(m*(bigstep>>1)*sizeof(uint32)); ASSERT(b != NULL, "B[]-array alloc failed!"); /* Jun 2021: added (psmall) map words for psmall = (mod 7|11) bitmap needed to support small-prime relocation - optimization - This needs wsize bytes, hence the (...+1)*wsize: */ - map = calloc((m+2+1)*wsize,sizeof(uint8)); ASSERT(HERE, map != NULL, "map[]-array alloc failed!"); + map = calloc((m+2+1)*wsize,sizeof(uint8)); ASSERT(map != NULL, "map[]-array alloc failed!"); // 2 extra word-slots at high end of map used for these temps - can't declare as const pointers,0x but treat as such below: lo = map + m*wsize; hi = lo + wsize; rmap = hi + wsize; @@ -1266,7 +1266,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, retval = 1; goto ERR_RETURN; } for(j = 2*num_b; j < m*num_b; j++) { - ASSERT(HERE,b[j] == bigstep + b[j-2*num_b],"Bigstep-power-offset check fails!"); + ASSERT(b[j] == bigstep + b[j-2*num_b],"Bigstep-power-offset check fails!"); } #if !defined(PM1_STANDALONE) && defined(PM1_DEBUG) @@ -1282,17 +1282,17 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, ierr += func_mod_square(mult[1], 0x0, n, 0,1,(uint64)a + mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0);/* and done. */ if(ierr != 0) { sprintf(cbuf,"Modmul test hit an error of type = %u! 
Aborting.",ierr); - ASSERT(HERE,0,cbuf); + ASSERT(0,cbuf); } convert_res_FP_bytewise(mult[0],(uint8*)vec1, n, p, 0x0,0x0,0x0); convert_res_FP_bytewise(mult[1],(uint8*)vec2, n, p, 0x0,0x0,0x0); - ASSERT(HERE, mi64_cmp_eq(vec1,vec2,nlimb), "Modmul-test results mismatch!"); + ASSERT(mi64_cmp_eq(vec1,vec2,nlimb), "Modmul-test results mismatch!"); /********************************************************************************/ /********* Known-stage-2-factor tests, starting with a stage 1 residue: *********/ /********************************************************************************/ // F31: Do a single stage-1-result-powering (pow^140091319777 - 1) and make sure the known factor divides the result: if(p == 2147483648) { - ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!"); + ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!"); input_is_int = TRUE; memcpy(a,pow,nbytes); modpow(a, mult[0], input_is_int, 140091319777ull, func_mod_square, p, n, scrnFlag,&tdif2); @@ -1304,14 +1304,14 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, // In fact, F31 has nlimb+1 words, but the only way a p-1 residue R has the same high bit // set as F31 iif R == F31 (uninteresting) or R == 2^2^31, which implies GCD == 1: int isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem); - ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!"); + ASSERT(isfact != 0, "Failed to find known stage 2 factor!"); fprintf(stderr,"%s p-1 known-stage-2 prime stage 1 powering success!\n",PSTRING); } // M(139788679): Do a stage-1-result-powering (pow^a - 1) with a = 9952471 and make sure the corresponding // known factor, q = 842944537391616 = 2.k.p+1 with k = 2^9.3^2.11^2.29.37.1187^2, divides the result. 
// With B1 < 1187^2 = 1408969 this factor is not found after stage 1 since this prime appears only as a single-power: if(p == 139788679) { - ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!"); + ASSERT(MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!"); // A^4002923: Use mult[0] as scratch array for modpow(): input_is_int = TRUE; memcpy(a,pow,nbytes); @@ -1322,7 +1322,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, convert_res_FP_bytewise(a,(uint8*)vec1, n, p, 0x0,0x0,0x0); uint64 rem[2] = {0ull,0ull}, q[2] = {11051162840690736129ull,12775ull}; // q = 1314651028704963254300497 int isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem); - ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!"); + ASSERT(isfact != 0, "Failed to find known stage 2 factor!"); fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING); exit(0); } @@ -1372,7 +1372,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, */ vone[0] = 1.0; for(j = 1; j < n; j++) { vone[j] = 0.0; } //vvv-- Pure-int inputs, so mode_flag = 0 - ierr = func_mod_square(vone, (void *)a, n, 0,1, 4ull, p, scrnFlag,&tdif2, FALSE, 0x0); ASSERT(HERE, ierr == 0,"fwdFFT(1) hit error!"); + ierr = func_mod_square(vone, (void *)a, n, 0,1, 4ull, p, scrnFlag,&tdif2, FALSE, 0x0); ASSERT(ierr == 0,"fwdFFT(1) hit error!"); #if !USE_PP1_MULTS // Basic version: @@ -1413,8 +1413,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, #ifdef PM1_DEBUG fprintf(stderr,"%u^2.",j); #endif -// fprintf(stderr,"buf[%3d] = 0x%llX\n",i,(uint64)buf[i]); - ASSERT(HERE, ((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!"); +// fprintf(stderr,"buf[%3d] = %#" PRIX64 "\n",i,(uint64)buf[i]); + ASSERT(((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!"); memcpy(buf[i++],mult[0],nbytes); // buf[i++] = mult[0] 
= fwd-FFT-pass-1-done(A^1,9,25,...) } // Up-multiply the fwd-FFT-pass-1-done(A^8,16,24,...) by fixed multiplier fwd-FFT(A^8): @@ -1430,7 +1430,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound, // vec1,vec2 will hold stage 1 residue A (stored in FP form in pow[]) and its mod-inverse in packed-bit form vec1[nlimb-1] = 0ull; convert_res_FP_bytewise(pow,(uint8*)vec1, n, p, &Res64,&Res35m1,&Res36m1); -fprintf(stderr,"#1: vec1 = A^+1 checksums = %llu,%llu,%llu; FP(A)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, pow[0],pow[1]); +fprintf(stderr,"#1: vec1 = A^+1 checksums = %" PRIu64 ",%" PRIu64 ",%" PRIu64 "; FP(A)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, pow[0],pow[1]); // First see if there's a savefile-copy of the s1 residue-inverse: strcpy(inv_file, RESTARTFILE); inv_file[0] = ((MODULUS_TYPE == MODULUS_TYPE_MERSENNE) ? 'p' : 'f'); @@ -1440,20 +1440,20 @@ fprintf(stderr,"#1: vec1 = A^+1 checksums = %llu,%llu,%llu; FP(A)[0:1] = %10.2f, if(fp) { // G-check residue fields all set NULL in this call: i = read_ppm1_savefiles(inv_file, p, &k, fp, &tmp, (uint8*)vec2, &Res64,&Res35m1,&Res36m1, 0x0,0x0,0x0,0x0); fclose(fp); fp = 0x0; - ASSERT(HERE, tmp == 0ull, "Stage 1 residue-inverse savefile should have nsquares == 0!"); + ASSERT(tmp == 0ull, "Stage 1 residue-inverse savefile should have nsquares == 0!"); if(!i) { /* First print any error message that may have been issued during the above function call: */ if(strstr(cbuf, "read_ppm1_savefiles")) mlucas_fprint(cbuf,pm1_standlone+1); // And now for the official spokesmessage: - snprintf_nowarn(cbuf,STR_MAX_LEN, "Read of stage 1 residue-inverse savefile %s failed for reasons unknown. Computing inverse...\n",inv_file); + snprintf(cbuf,STR_MAX_LEN*2, "Read of stage 1 residue-inverse savefile %s failed for reasons unknown. 
Computing inverse...\n",inv_file); mlucas_fprint(cbuf,pm1_standlone+1); } else { s1_inverse = TRUE; } } if(!s1_inverse) { - snprintf(cbuf,STR_MAX_LEN, "Stage 2: Computing mod-inverse of Stage 1 residue...\n"); mlucas_fprint(cbuf,pm1_standlone+1); + snprintf(cbuf,STR_MAX_LEN*2, "Stage 2: Computing mod-inverse of Stage 1 residue...\n"); mlucas_fprint(cbuf,pm1_standlone+1); modinv(p,vec1,vec2,nlimb); // Result in vec2 Res64 = vec2[0]; Res35m1 = mi64_div_by_scalar64(vec2,two35m1,nlimb,0x0); @@ -1464,12 +1464,12 @@ fprintf(stderr,"#1: vec1 = A^+1 checksums = %llu,%llu,%llu; FP(A)[0:1] = %10.2f, write_ppm1_savefiles(inv_file,p,n,fp, 0ull, (uint8*)vec2,Res64,Res35m1,Res36m1, 0x0,0x0,0x0,0x0); fclose(fp); fp = 0x0; } else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open restart file %s for write of checkpoint data.\n",inv_file); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open restart file %s for write of checkpoint data.\n",inv_file); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } } convert_res_bytewise_FP((uint8*)vec2, a, n, p); // Use a[] to hold inverse A^-1 until done with it -fprintf(stderr,"#1: vec2 = A^-1 checksums = %llu,%llu,%llu; FP(A^-1)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, a[0],a[1]); +fprintf(stderr,"#1: vec2 = A^-1 checksums = %" PRIu64 ",%" PRIu64 ",%" PRIu64 "; FP(A^-1)[0:1] = %10.2f,%10.2f\n",Res64,Res35m1,Res36m1, a[0],a[1]); #ifdef PM1_DEBUG fprintf(stderr,"Checking mod-inverse...\n"); // Debug: check inverse ... start with copies of A (pow[]) and A^-1 (a[]) into mult0-1: @@ -1479,8 +1479,8 @@ fprintf(stderr,"#1: vec2 = A^-1 checksums = %llu,%llu,%llu; FP(A^-1)[0:1] = %10. 
// mult[0] = A * A^-1, check that result = 1 as expected: mode_flag = 0; // bits 0:1 of mode_flag = 0, since mult[0] enters in pure-int form and want output the same way ierr += func_mod_square(mult[0], 0x0, n, 0,1, (uint64)mult[1] + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); - ASSERT(HERE, mult[0][0] == 1.0, "inverse-check fails!"); - for(i = 1; i < npad; i++) { ASSERT(HERE, mult[0][i] == 0.0, "inverse-check fails!"); } + ASSERT(mult[0][0] == 1.0, "inverse-check fails!"); + for(i = 1; i < npad; i++) { ASSERT(mult[0][i] == 0.0, "inverse-check fails!"); } #endif /*** NOTE: vec1,vec2 hold stage 1 residue A and its mod-inverse in packed-bit form, can use any of pow[], mult[0-2][], a[] in our ensuing stage 2 inits and still re-obtain b and b^-1 from vec1,2 at any time. ***/ @@ -1502,19 +1502,19 @@ fprintf(stderr,"#1: vec2 = A^-1 checksums = %llu,%llu,%llu; FP(A^-1)[0:1] = %10. // limb than p>>6, so shorten sub-vec-length by 1 to make sure any carry ends up in tmp, not vec2[nlimb-1] // Sign of wraparound carry depends on modulus type: if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { - ASSERT(HERE,tmp == 0ull,"Mersenne-mod vec1+vec2 should never spill over into next word!"); + ASSERT(tmp == 0ull,"Mersenne-mod vec1+vec2 should never spill over into next word!"); // Now get the really carry bit, bit p. 
Normalized residue only uses bits <0:p-1>: bit = p&63; word = p>>6; tmp = vec2[word]>>bit; q = mi64_add_scalar(vec2, tmp, vec2, nlimb); - ASSERT(HERE,q == 0ull,"Mersenne-mod vec1+vec2 wraparound carry should never have carry-out!"); + ASSERT(q == 0ull,"Mersenne-mod vec1+vec2 wraparound carry should never have carry-out!"); } else { q = mi64_sub_scalar(vec2, tmp, vec2, nlimb-1); // Again shorten vec-sub-length by 1 to properly check for borrow-out - ASSERT(HERE,q == 0ull,"Fermat-mod vec1+vec2 wraparound carry should never have borrow-out!"); + ASSERT(q == 0ull,"Fermat-mod vec1+vec2 wraparound carry should never have borrow-out!"); } convert_res_bytewise_FP((uint8*)vec2,buf[0], n, p); // buf[0] = V[1] // Now recover original vec2 = A^-1 from the FP version in a[]: convert_res_FP_bytewise(a,(uint8*)vec2, n, p, &Res64, &Res35m1, &Res36m1); -fprintf(stderr,"#2: vec2 = A^-1 checksums = %llu,%llu,%llu\n",Res64,Res35m1,Res36m1); +fprintf(stderr,"#2: vec2 = A^-1 checksums = %" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",Res64,Res35m1,Res36m1); #else // Original FP code for V[1] lacks post-add normalization: for(i = 0; i < npad; i++) { buf[0][i] = pow[i] + a[i]; } // V[1] = A^1 + A^-1 @@ -1677,7 +1677,7 @@ MME = 0; } else { tmp = vec1[nlimb-1]; } - ASSERT(HERE,tmp == 0ull,"Properly normalized residue should never spill over into next word!"); + ASSERT(tmp == 0ull,"Properly normalized residue should never spill over into next word!"); #endif // PM1_DEBUG? #endif // USE_PP1_MULTS? @@ -1687,18 +1687,18 @@ MME = 0; #endif if(nerr != 0) { sprintf(cbuf,"Stage 2 buffer-init hit 1 or more fatal errors! 
Aborting."); - mlucas_fprint(cbuf,pm1_standlone+0); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+0); ASSERT(0,cbuf); } if(i != m*num_b) { sprintf(cbuf,"Stage 2: Incorrect loop-exit value of buffer-index!"); - mlucas_fprint(cbuf,pm1_standlone+0); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+0); ASSERT(0,cbuf); } // buf[] entries all need to be rest-of-fwd-FFTed; for(i = 0; i < m*num_b; i++) { // Since buf[0] holds pure-int copy of stage 1 residue A on loop entry, bit 0 of mode_flag = 0 for just it: // vvvvvvvv ierr = func_mod_square(buf[i], 0x0, n, 0,1, 4ull + (uint64)(mode_flag - (i==0)), p, scrnFlag,&tdif2, FALSE, 0x0); nerr += ierr; - } ASSERT(HERE, nerr == 0, "fwdFFT of buf[] entries returns error!"); + } ASSERT(nerr == 0, "fwdFFT of buf[] entries returns error!"); // Accumulate the cycle count in a floating double on each pass to avoid problems // with integer overflow of the clock() result, if clock_t happens to be 32-bit int on the host platform: @@ -1708,7 +1708,7 @@ MME = 0; clock2 = getRealTime(); #endif *tdiff = clock2 - clock1; clock1 = clock2; - snprintf_nowarn(cbuf,STR_MAX_LEN, "Buffer-init done; clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME); + snprintf(cbuf,STR_MAX_LEN*2, "Buffer-init done; clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME); mlucas_fprint(cbuf,pm1_standlone+1); /********************* RESTART FILE STUFF: **********************/ @@ -1726,7 +1726,7 @@ MME = 0; if(i && psmall) { // We expect the main-program S2-invocation code to have resolved this kind of mismatch via bigstep selection: if(qlo >> 56) - ASSERT(HERE, (uint32)(qlo >> 56) == psmall, "Mismatch between relocation-prime set for stage 2 restart and one read from S2 savefile!"); + ASSERT((uint32)(qlo >> 56) == psmall, "Mismatch between relocation-prime set for stage 2 restart and one read from S2 savefile!"); } qlo &= 0x00ffffffffffffffull; // Mask off high byte storing psmall // If savefile-read fails, start stage 2 from B2_start: @@ 
-1739,9 +1739,9 @@ MME = 0; } // If nsquares > B2_start, arrtmp holds the S2 interim residue for q = nsquares; set up to restart S2 at that point. if(qlo >= B2_start) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "Read stage 2 savefile %s ... restarting stage 2 from q = %llu.\n",savefile,qlo); + snprintf(cbuf,STR_MAX_LEN*2, "Read stage 2 savefile %s ... restarting stage 2 from q = %" PRIu64 ".\n",savefile,qlo); } else { // If user running a new partial S2 interval with bounds larger than a previous S2 run, allow but info-print to that effect: - snprintf_nowarn(cbuf,STR_MAX_LEN, "INFO: %s savefile has qlo[%llu] <= B2_start[%llu] ... Stage 2 interval will skip intervening primes.\n",func,qlo,B2_start); + snprintf(cbuf,STR_MAX_LEN*2, "INFO: %s savefile has qlo[%" PRIu64 "] <= B2_start[%" PRIu64 "] ... Stage 2 interval will skip intervening primes.\n",func,qlo,B2_start); } mlucas_fprint(cbuf,pm1_standlone+1); restart = TRUE; @@ -1758,7 +1758,7 @@ MME = 0; if(!qlo) { // If qlo unset, set = default stage 2 starting point ... if qlo already set via restart-file qlo = B2_start; // read, it will automatically be > our small-prime-relocation-reflecting value of B2_start. 
if(psmall && B2_start > B1) { // If psmall = 0, it's an S2 continuation run, no relocation done - sprintf(cbuf,"Small-prime[%u] relocation: will start Stage 2 at bound %llu\n",psmall,qlo); + sprintf(cbuf,"Small-prime[%u] relocation: will start Stage 2 at bound %" PRIu64 "\n",psmall,qlo); mlucas_fprint(cbuf,pm1_standlone+1); } } @@ -1774,9 +1774,9 @@ MME = 0; k = k0 = q0/bigstep; // Now set k to its 'real' value if((uint64)k*bigstep != q0) { sprintf(cbuf,"k must be 32-bit!"); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } - sprintf(cbuf,"Stage 2 q0 = %llu, k0 = %u\n",q0,k0); + sprintf(cbuf,"Stage 2 q0 = %" PRIu64 ", k0 = %u\n",q0,k0); mlucas_fprint(cbuf,pm1_standlone+1); /* Expanded-match-window scheme needs us to precompute singleton-prime-q's bitmap corresponding to M intervals @@ -1804,7 +1804,7 @@ MME = 0; We only start actual 0-interval and extended-window pairing when said interval has shifted to the middle of the extended pairing window, i.e. is the 0-interval (M odd), or shifted just left of the map midpoint (M even): */ - ASSERT(HERE, q0 > (m2+1)*(uint64)bigstep, "ERROR: qlo underflows in p-1 stage 2."); + ASSERT(q0 > (m2+1)*(uint64)bigstep, "ERROR: qlo underflows in p-1 stage 2."); qlo = q0 - (m2+1)*(uint64)bigstep; /* [c] Our A^(a^2) values = A^((k*D)^2) and we'll be incrementing k between sweeps over the set of b's. @@ -1813,7 +1813,7 @@ MME = 0; */ // At this point pow = A[stage 1 residue]; need either A^(D^2) or (A^D + A^-D), where D = bigstep: #ifndef PM1_STANDALONE - snprintf_nowarn(cbuf,STR_MAX_LEN, "Computing Stage 2 loop-multipliers...\n"); mlucas_fprint(cbuf,pm1_standlone+1); + snprintf(cbuf,STR_MAX_LEN*2, "Computing Stage 2 loop-multipliers...\n"); mlucas_fprint(cbuf,pm1_standlone+1); MME = 0.0; // Reset maxROE // Raise A to power D^2, using mult[0] as a scratch array; again crap-API forces us to specify an "input is pure-int?" 
flag: input_is_int = TRUE; @@ -1894,9 +1894,9 @@ MME = 0; // mult[0] = A^+D * A^-D, check that result = 1 as expected: mode_flag = 1; // bits 0:1 of mode_flag = 1,0, since mult[2] enters in fwd-FFT-pass-1-done form and want output in pure-int form ierr += func_mod_square(mult[2], 0x0, n, 0,1, (uint64)a + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); - ASSERT(HERE, mult[2][0] == 1.0, "inverse-check fails!"); + ASSERT(mult[2][0] == 1.0, "inverse-check fails!"); for(i = 1; i < npad; i++) { - ASSERT(HERE, mult[2][i] == 0.0, "inverse-check fails!"); + ASSERT(mult[2][i] == 0.0, "inverse-check fails!"); } fprintf(stderr,"A^-D inverse check passed ... exiting.\n"); exit(0); // Since above debug overwrites mult[2], must quit. @@ -1924,9 +1924,9 @@ MME = 0; tmp = vec2[nlimb-1]; q = mi64_sub_scalar(vec2, tmp, vec2, nlimb-1); // Again shorten vec-sub-length by 1 to properly check for borrow-out } - ASSERT(HERE,q == 0ull,"Properly normalized vec1+vec2 wraparound carry should never have borrow-out!"); + ASSERT(q == 0ull,"Properly normalized vec1+vec2 wraparound carry should never have borrow-out!"); // Now compare to alternate-path V[D] computed in above buffer-init code: - ASSERT(HERE, mi64_cmp_eq(vec1,vec2,nlimb), "V[D] results mismatch!"); + ASSERT(mi64_cmp_eq(vec1,vec2,nlimb), "V[D] results mismatch!"); fprintf(stderr,"V[D] cross-check passed ... exiting.\n"); exit(0); // Since above debug overwrites a[], must quit. #endif // PM1_DEBUG? 
@@ -1942,9 +1942,9 @@ MME = 0; // pow[] = A^+((k0-1)*D) * A^-((k0-1)*D), check that result = 1 as expected: mode_flag = 1; // bits 0:1 of mode_flag = 1,0, since pow[] enters in fwd-FFT-pass-1-done form and want output in pure-int form ierr += func_mod_square(pow, 0x0, n, 0,1, (uint64)mult[1] + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); - ASSERT(HERE, pow[0] == 1.0, "A^-(k0-1)*D inverse-check fails!"); + ASSERT(pow[0] == 1.0, "A^-(k0-1)*D inverse-check fails!"); for(i = 1; i < npad; i++) { - ASSERT(HERE, pow[i] == 0.0, "A^-(k0-1)*D inverse-check fails!"); + ASSERT(pow[i] == 0.0, "A^-(k0-1)*D inverse-check fails!"); } fprintf(stderr,"A^-(k0-1)*D inverse check passed ... exiting.\n"); exit(0); // Since above debug overwrites pow[], must quit. @@ -1973,9 +1973,9 @@ MME = 0; ierr += func_mod_square(mult[0], 0x0, n, 0,1, 4ull + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); mode_flag = 1; // bits 0:1 of mode_flag = 1,0, since pow[] enters in fwd-FFT-pass-1-done form and want output in pure-int form ierr += func_mod_square(pow, 0x0, n, 0,1, (uint64)mult[0] + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); - ASSERT(HERE, pow[0] == 1.0, "A^-k0*D inverse-check fails!"); + ASSERT(pow[0] == 1.0, "A^-k0*D inverse-check fails!"); for(i = 1; i < npad; i++) { - ASSERT(HERE, pow[i] == 0.0, "A^-k0*D inverse-check fails!"); + ASSERT(pow[i] == 0.0, "A^-k0*D inverse-check fails!"); } fprintf(stderr,"A^-k0*D inverse check passed ... exiting.\n"); exit(0); // Since above debug overwrites pow[], must quit. 
@@ -1990,8 +1990,8 @@ MME = 0; if(restart) { // If restart, convert bytewise-residue S2 accumulator read from file to floating-point form: if(!convert_res_bytewise_FP((uint8*)arrtmp, pow, n, p)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",savefile); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on primality-test residue read from savefile %s!\n",savefile); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } // Restart-file-read S2 interim residue in pow[] needs fwd-weight and FFT-pass1-done: ierr = func_mod_square(pow, 0x0, n, 0,1, -4ull, p, scrnFlag,&tdif2, FALSE, 0x0); @@ -2003,8 +2003,8 @@ MME = 0; #if 0 // A: No, because pow = A^(k0*D) + A^-(k0*D) is perfectly fine as S2 init-accumulator vec1[nlimb-1] = 0ull; if(!convert_res_bytewise_FP((uint8*)vec1, pow, n, p)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: convert_res_bytewise_FP Failed on S1 residue in vec1!\n"); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: convert_res_bytewise_FP Failed on S1 residue in vec1!\n"); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } // Pure-int S1 residue in pow[] needs fwd-weight and FFT-pass1-done: ierr = func_mod_square(pow, 0x0, n, 0,1, -4ull, p, scrnFlag,&tdif2, FALSE, 0x0); @@ -2021,7 +2021,7 @@ MME = 0; ierr = func_mod_square(mult[2], 0x0, n, 0,1, 4ull + (uint64)mode_flag, p, scrnFlag,&tdif2, FALSE, 0x0); nerr += ierr; if(nerr != 0) { sprintf(cbuf,"Stage 2 loop-multipliers computation hit one or more fatal errors! 
Aborting."); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } #ifdef CTIME clock2 = clock(); @@ -2029,7 +2029,7 @@ MME = 0; clock2 = getRealTime(); #endif *tdiff = clock2 - clock1; clock1 = clock2; - snprintf_nowarn(cbuf,STR_MAX_LEN, "Stage 2 loop-multipliers: clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME); + snprintf(cbuf,STR_MAX_LEN*2, "Stage 2 loop-multipliers: clocks =%s, MaxErr = %10.9f.\n",get_time_str(*tdiff), MME); mlucas_fprint(cbuf,pm1_standlone+1); *tdiff = AME = MME = 0.0; // Reset timer and maxROE, now also init AvgROE AME_ITER_START = 0; // For p-1 stage 2, start collecting AvgROE data immediately, no need t wait for residue to "fill in" @@ -2050,7 +2050,7 @@ MME = 0; { if(!reloc_on && q >= reloc_start) { // Start including relocation-semiprimes once S@ passes this point reloc_on = TRUE; - sprintf(cbuf,"Hit q = %llu >= reloc_start[%llu] ... enabling small-prime relocation.\n",q,reloc_start); + sprintf(cbuf,"Hit q = %" PRIu64 " >= reloc_start[%" PRIu64 "] ... 
enabling small-prime relocation.\n",q,reloc_start); mlucas_fprint(cbuf,pm1_standlone+1); } // Only start actual 0-interval and extended-window pairing when q hits q0: @@ -2061,7 +2061,7 @@ MME = 0; center of the expanded-match window, recompute same batch of stage 2 powering pairs q1,q2[i] = k*D +- b[i] and process the both-q1-and-q2-prime pairs: */ #ifdef PM1_DEBUG - fprintf(stderr,"k = %u: q = %llu\n",k,q); + fprintf(stderr,"k = %u: q = %" PRIu64 "\n",k,q); fprintf(stderr,"Processing 0-interval prime pairs:\n"); #endif map_lo = map + m2*wsize + (wsize>>1); // ptr to midpoint of 0-interval map word @@ -2071,9 +2071,9 @@ MME = 0; #ifdef PM1_DEBUG q1 = q - b[i]; q2 = q + b[i]; bit = pprimeF64(q1,2ull); if(p1 != bit) - fprintf(stderr,"Mismatch: q1 = %llu[%u], bytevec_test_bit returns %u\n",q1,bit,p1); + fprintf(stderr,"Mismatch: q1 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q1,bit,p1); bit = pprimeF64(q2,2ull); if(p2 != bit) - fprintf(stderr,"Mismatch: q2 = %llu[%u], bytevec_test_bit returns %u\n",q2,bit,p2); + fprintf(stderr,"Mismatch: q2 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q2,bit,p2); #endif // Skip a given value of i if one or both of q1,q2[i] are composite according to a 2-prp test: j = p1+p2; @@ -2083,12 +2083,12 @@ MME = 0; if(j < m) continue; #ifdef PM1_DEBUG - fprintf(stderr,"\tq1 = %llu[%u], q2 = %llu[%u], 1-prime\n",q1,p1,q2,p2); + fprintf(stderr,"\tq1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], 1-prime\n",q1,p1,q2,p2); #endif ns++; } else { #ifdef PM1_DEBUG - fprintf(stderr,"\tq1 = %llu[%u], q2 = %llu[%u], both prime\n",q1,p1,q2,p2); + fprintf(stderr,"\tq1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], both prime\n",q1,p1,q2,p2); #endif np++; } @@ -2143,16 +2143,16 @@ MME = 0; p1 = bytevec_test_bit(map_lo,j); p2 = bytevec_test_bit(map_hi,j); q1 = q - b[tmp+j]; q2 = q + b[tmp+j]; bit = pprimeF64(q1,2ull); if(p1 != bit) - fprintf(stderr,"Mismatch: q1 = %llu[%u], bytevec_test_bit returns %u\n",q1,bit,p1); + fprintf(stderr,"Mismatch: q1 = %" PRIu64 
"[%u], bytevec_test_bit returns %u\n",q1,bit,p1); bit = pprimeF64(q2,2ull); if(p2 != bit) - fprintf(stderr,"Mismatch: q2 = %llu[%u], bytevec_test_bit returns %u\n",q2,p2,bit); + fprintf(stderr,"Mismatch: q2 = %" PRIu64 "[%u], bytevec_test_bit returns %u\n",q2,p2,bit); #endif // For thus-paired prime-q's, update stage 2 accumulator: p1 = bytevec_test_bit(lo,j); if(p1) { np++; #ifdef PM1_DEBUG - fprintf(stderr,"\tq = %llu -+ %u: q1 = %llu[%u], q2 = %llu[%u], paired singles\n",q,b[tmp+j],q-b[tmp+j],p1,q+b[tmp+j],p2); + fprintf(stderr,"\tq = %" PRIu64 " -+ %u: q1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], paired singles\n",q,b[tmp+j],q-b[tmp+j],p1,q+b[tmp+j],p2); #endif #ifndef PM1_STANDALONE #ifdef USE_VEC_DBL_SUB @@ -2194,7 +2194,7 @@ MME = 0; #ifdef PM1_DEBUG q1 = q-b[tmp+i]; q2 = q+b[tmp+i]; p1 = pprimeF64(q1,2ull); p2 = pprimeF64(q2,2ull); // Run q1,q2 through a base-2 Fermat-composite test - fprintf(stderr,"\tq = %llu -+ %u: q1 = %llu[%u], q2 = %llu[%u], 1-prime\n",q,b[tmp+i],q1,p1,q2,p2); + fprintf(stderr,"\tq = %" PRIu64 " -+ %u: q1 = %" PRIu64 "[%u], q2 = %" PRIu64 "[%u], 1-prime\n",q,b[tmp+i],q1,p1,q2,p2); #endif #ifndef PM1_STANDALONE #ifdef USE_VEC_DBL_SUB @@ -2247,7 +2247,7 @@ MME = 0; Working leftward from right endpoint of interval, init single 2*num_b-bit accumulator. 
*/ #ifdef PM1_DEBUG - fprintf(stderr,"New upper-interval with q0 = %llu, tagging its primes:\n",tmp); + fprintf(stderr,"New upper-interval with q0 = %" PRIu64 ", tagging its primes:\n",tmp); #endif /* Prime relocation: Illustrate using psmall = 11, but analogous pattering holds for psmall = 7 (D = 330|660): @@ -2309,13 +2309,13 @@ MME = 0; if(bytevec_test_bit(rmap,j )) { q1 *= pinv64; #ifdef PM1_DEBUG - fprintf(stderr,"reloc q1: %llu => %llu\n",q1,q1*psmall); + fprintf(stderr,"reloc q1: %" PRIu64 " => %" PRIu64 "\n",q1,q1*psmall); #endif } if(bytevec_test_bit(rmap,i+num_b)) { q2 *= pinv64; #ifdef PM1_DEBUG - fprintf(stderr,"reloc q2: %llu => %llu\n",q2,q2*psmall); + fprintf(stderr,"reloc q2: %" PRIu64 " => %" PRIu64 "\n",q2,q2*psmall); #endif } } @@ -2336,7 +2336,7 @@ MME = 0; if(psmall && bytevec_test_bit(rmap,j)) { q1 *= pinv64; #ifdef PM1_DEBUG - fprintf(stderr,"reloc q: %llu => %llu\n",q1,q1*psmall); + fprintf(stderr,"reloc q: %" PRIu64 " => %" PRIu64 "\n",q1,q1*psmall); #endif } p1 = pprimeF64(q1,2ull); @@ -2446,7 +2446,7 @@ MME = 0; strftime(timebuffer,SIZE,"%Y-%m-%d %H:%M:%S",local_time); AME /= (nmodmul - nmodmul_save); // Print [date in hh:mm:ss | p | stage progress | %-complete | time | per-iter time | Res64 | max ROE: - snprintf_nowarn(cbuf,STR_MAX_LEN, "[%s] %s %s = %llu [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016llX. AvgMaxErr = %10.9f. MaxErr = %10.9f.\n" + snprintf(cbuf,STR_MAX_LEN*2, "[%s] %s %s = %" PRIu64 " [%5.2f%% complete] clocks =%s [%8.4f msec/iter] Res64: %016" PRIX64 ". AvgMaxErr = %10.9f. 
MaxErr = %10.9f.\n" , timebuffer, PSTRING, "S2 at q", q+bigstep, (float)(q-B2_start)/(float)(B2-B2_start) * 100,get_time_str(*tdiff) , 1000*get_time(*tdiff)/(nmodmul - nmodmul_save), Res64, AME, MME); mlucas_fprint(cbuf,pm1_standlone+scrnFlag); @@ -2458,8 +2458,8 @@ MME = 0; write_ppm1_savefiles(savefile,p,n,fp, ((uint64)psmall<<56) + q + bigstep, (uint8*)arrtmp,Res64,Res35m1,Res36m1, 0x0,0x0,0x0,0x0); fclose(fp); fp = 0x0; } else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "ERROR: unable to open restart file %s for write of checkpoint data.\n",savefile); - mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2, "ERROR: unable to open restart file %s for write of checkpoint data.\n",savefile); + mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(0,cbuf); } // If interim-GCDs enabled (default) and latest S2 interval crossed a 10M mark, take a GCD; if factor found, early-return; if(interim_gcd) { @@ -2481,7 +2481,7 @@ MME = 0; } #endif // #ifndef PM1_STANDALONE } // endfor(q = qlo; q < qhi; q += bigstep) - ASSERT(HERE, nerr == 0, "Stage 2 loop hit a modmul error!"); + ASSERT(nerr == 0, "Stage 2 loop hit a modmul error!"); #ifndef PM1_STANDALONE // Need to undo pass 1 of fwd-FFT on loop-exit; do this just as with fwd-FFT-only, but with flag = 8 instead of 4: ierr = func_mod_square(pow, 0x0, n, 0,1, 8ull, p, scrnFlag,&tdif2, FALSE, 0x0); @@ -2491,40 +2491,40 @@ MME = 0; #endif // (k - k0) = #bigstep-blocks (passes thru above loop) used in stage 2; np + ns + 2*(k - k0) = #modmul: nmodmul = np + ns + 2*(k - k0); // This is actually redundant, but just to spell it out - snprintf(cbuf,STR_MAX_LEN,"M = %2u: #buf = %4u, #pairs: %u, #single: %u (%5.2f%% paired), #blocks: %u, #modmul: %u\n",m,m*num_b,np,ns,100.0*2*np/(2*np+ns),k-k0,nmodmul); + snprintf(cbuf,STR_MAX_LEN*2,"M = %2u: #buf = %4u, #pairs: %u, #single: %u (%5.2f%% paired), #blocks: %u, #modmul: %u\n",m,m*num_b,np,ns,100.0*2*np/(2*np+ns),k-k0,nmodmul); mlucas_fprint(cbuf,pm1_standlone+1); #ifndef 
PM1_STANDALONE #ifdef PM1_DEBUG #warning Revert this preprocessor flag! - fprintf(stderr,"Res64 = 0x%016llX; clocks =%s, MaxErr = %10.9f\n",arrtmp[0],get_time_str(*tdiff),MME); + fprintf(stderr,"Res64 = %#016" PRIX64 "; clocks =%s, MaxErr = %10.9f\n",arrtmp[0],get_time_str(*tdiff),MME); if(p == 33554432) { // F25: check if the known factor divides the S2 result: - ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!"); + ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!"); int isfact = mi64_is_div_by_scalar64(arrtmp,2170072644496392193ull,nlimb); // k = 2^5.3^2.37.997.11066599 - ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!"); + ASSERT(isfact != 0, "Failed to find known stage 2 factor!"); fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING); } if(p == 108268067) { - ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!"); + ASSERT(MODULUS_TYPE == MODULUS_TYPE_MERSENNE, "This p-1 self-test requires Mersenne-mod mode!"); uint64 rem[2] = {0ull,0ull}, q[2] = {11943519037290122063ull,18561975ull}; // k = 7.17.19.61.294313.38955941; q = k.2^p + 1 int isfact = mi64_div(arrtmp,q, nlimb,2, 0x0, rem); - ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!"); + ASSERT(isfact != 0, "Failed to find known stage 2 factor!"); fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING); } if(p == 2147483648) { // F31: check if the known factor divides the S2 result: - ASSERT(HERE, MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!"); + ASSERT(MODULUS_TYPE == MODULUS_TYPE_FERMAT, "This p-1 self-test requires Fermat-mod mode!"); uint64 rem[2] = {0ull,0ull}, q[2] = {3118754346955702273ull,2544ull}; // k = 3.13.140091319777; q = k.2^(m+2) + 1 int isfact = mi64_div(arrtmp,q, nlimb,2, 0x0, rem); - ASSERT(HERE, isfact != 0, "Failed to find known stage 2 factor!"); + 
ASSERT(isfact != 0, "Failed to find known stage 2 factor!"); fprintf(stderr,"%s p-1 known-stage-2 prime self-test success!\n",PSTRING); } #endif // PM1_DEBUG // In case of normal (non-early) return, caller will handle the GCD: if(strlen(gcd_str)) { - snprintf_nowarn(cbuf,STR_MAX_LEN, "Stage 2 early-return due to factor found; MaxErr = %10.9f.\n",MME); + snprintf(cbuf,STR_MAX_LEN*2, "Stage 2 early-return due to factor found; MaxErr = %10.9f.\n",MME); } else { - snprintf_nowarn(cbuf,STR_MAX_LEN, "Stage 2 done; MaxErr = %10.9f. Taking GCD...\n",MME); + snprintf(cbuf,STR_MAX_LEN*2, "Stage 2 done; MaxErr = %10.9f. Taking GCD...\n",MME); } mlucas_fprint(cbuf,pm1_standlone+scrnFlag); #endif @@ -2572,7 +2572,7 @@ MME = 0; // while(tpool->tasks_queue.num_tasks != 0) { //*** not safe, since can have #tasks == 0 with some tasks still in flight *** while(tpool->free_tasks_queue.num_tasks != NTHREADS) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep re-call-on-signal fail!"); // printf("sleep; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); } // printf("end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); diff --git a/src/qfcheb.h b/src/qfcheb.h index 6a8a23be..78a554ca 100755 --- a/src/qfcheb.h +++ b/src/qfcheb.h @@ -1,2 +1,2 @@ #define STR_MAX_LEN 1024 -extern char cbuf[STR_MAX_LEN]; +extern char cbuf[STR_MAX_LEN*2]; diff --git a/src/qfloat.c b/src/qfloat.c index ca243c76..bc171f2f 100755 --- a/src/qfloat.c +++ b/src/qfloat.c @@ -308,7 +308,7 @@ uint32 qfcmplt(struct qfloat q1, struct qfloat q2) case(3) : /* Both q1 and q2 negative, in which case a more-negative q1 looks larger w.r.to the unsigned compare */ return (q1.hi > q2.hi || (q1.hi == q2.hi && q1.lo > q2.lo)); default: - ASSERT(HERE, 0,"ERROR 98 in qfloat.c"); + ASSERT(0,"ERROR 98 in qfloat.c"); } return 0; /* Just to get 
the compiler to shut up ... this should never be reached. */ } @@ -398,9 +398,9 @@ long double qfldbl(struct qfloat x) long double ld; uint64 *ld_ptr = (uint64 *)&ld, nonhidden; int32 exp = (int32)((x.hi & ~MASK_SIGN)>>52); - ASSERT(HERE, sizeof(long double) == 16, "QFLDBL assumes 16-byte long double type!"); + ASSERT(sizeof(long double) == 16, "QFLDBL assumes 16-byte long double type!"); // Denormal check: - ASSERT(HERE, (exp != 0) && (exp != 0x7ff), "QFLDBL requires normal input!"); + ASSERT((exp != 0) && (exp != 0x7ff), "QFLDBL requires normal input!"); exp -= (int32)0x400; // x87 80-bit reg-format has 4 more bits in exp, centered around 0x4000 rather than 0x400 nonhidden = ((x.hi & MASK_MANT)<<11) + (x.lo>>53) + ((x.lo>>52)&0x1); // Rounding of the off-shifted portion may cause nonhidden-bit summation to overflow into sign bit: @@ -420,12 +420,12 @@ struct qfloat ldbl_to_q(long double x) long double ld = x; uint64 *ld_ptr = (uint64 *)&ld, x87_mant, x87_sexp; // Note high 48 bits of x87_sexp are uninited int32 exp; - DBG_ASSERT(HERE, sizeof(long double) == 16, "LDBL_TO_Q assumes 16-byte long double type!"); + DBG_ASSERT(sizeof(long double) == 16, "LDBL_TO_Q assumes 16-byte long double type!"); x87_mant = *ld_ptr; x87_sexp = *(ld_ptr+1); if(!x87_mant) return QZRO; // Denormal check: exp = (int32)(((x87_sexp<<48) & ~MASK_SIGN)>>48) - (int32)0x4000; - ASSERT(HERE, ABS(exp) <= 0x3ff, "LDBL_TO_Q requires double-compatible normal input!"); + ASSERT(ABS(exp) <= 0x3ff, "LDBL_TO_Q requires double-compatible normal input!"); q.hi = ((x87_sexp<<48) & MASK_SIGN) + ((uint64)((int32)0x400 + exp)<<52) + ((x87_mant>>11) & MASK_MANT); q.lo = (x87_mant<<53); return q; @@ -508,7 +508,7 @@ struct qfloat i128_to_q(uint128 i) q.hi = sexp + (i.d1 >> rshift); offword = (i.d0 << lshift) >> 63; // MSB of off-shifted portion q.lo = (i.d1 << lshift) + (i.d0 >> rshift) + offword; - ASSERT(HERE, q.lo >= offword, "Ripple-carry!"); // For now, just check for ripple-carry. 
Proper handling will come later. + ASSERT(q.lo >= offword, "Ripple-carry!"); // For now, just check for ripple-carry. Proper handling will come later. } else /* need to left-shift mantissa */ { @@ -547,7 +547,7 @@ struct qfloat qfmul_pow2(struct qfloat q, int32 pow) qt.hi += sgn; // Restore sign } } else if(exp1 >> 11) { // Overflow: exp+pow carried into sign-bit slpt: - ASSERT(HERE, 0,"OVERFLOW in qfmul_pow2"); + ASSERT(0,"OVERFLOW in qfmul_pow2"); } else { // Result is normal if(exp0) { // If normal input, update exponent field and return: @@ -560,11 +560,11 @@ struct qfloat qfmul_pow2(struct qfloat q, int32 pow) lz = QLEADZ(qt) - 11; // Number of leading zero bits in denormalized mantissa (i.e. shift count needed to move leading bit into hidden-bit location) if(pow > lz) { // Result will be normal QLSHIFT(qt, lz, qt); - ASSERT(HERE, (qt.hi>>52) == 1, "Bad mantissa left-shift count in qfmul_pow2!"); + ASSERT((qt.hi>>52) == 1, "Bad mantissa left-shift count in qfmul_pow2!"); qt.hi += (((uint64)pow-lz)<<52) - TWOE52; // Don't fold -TWOE52 in via (pow-lz-1)<<52, since may have pow = lz here. 
} else { // Result still denormal QLSHIFT(qt, pow, qt); - ASSERT(HERE, (qt.hi>>52) == 0, "Bad mantissa left-shift count in qfmul_pow2!"); + ASSERT((qt.hi>>52) == 0, "Bad mantissa left-shift count in qfmul_pow2!"); } } qt.hi += sgn; // Restore sign @@ -584,7 +584,7 @@ uint128 qfnint(struct qfloat q) uint64 offword, carry; uint128 i; i.d1 = q.hi; i.d0 = q.lo; - ASSERT(HERE, qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!"); + ASSERT(qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!"); /* Separate upper part of the significand from the sign/exponent fields: */ sign = (int32)(i.d1 >> 63); exp = (int32)(i.d1 >> 52) & MASK_EXP; @@ -667,7 +667,7 @@ uint128 qfint(struct qfloat q) int32 exp, sign, rshift, lshift; uint128 i; i.d1 = q.hi; i.d0 = q.lo; - ASSERT(HERE, qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!"); + ASSERT(qfcmpge(q, qfneg(two127)) && qfcmplt(q, two127), "QFNINT input out of range!"); /* Separate upper part of the significand from the sign/exponent fields: */ sign = (int32)(i.d1 >> 63); @@ -692,7 +692,7 @@ uint128 qfint(struct qfloat q) { if(rshift == -11 && (!sign || (i.d1 << -rshift) != MASK_SIGN || i.d0 != (uint64)0)) { - ASSERT(HERE, 0,"ERROR: qfloat is too large to convert to 128-bit integer."); + ASSERT(0,"ERROR: qfloat is too large to convert to 128-bit integer."); } } lshift = - rshift; @@ -859,7 +859,7 @@ char* qf2str(struct qfloat q) --pow10; } } - ASSERT(HERE, mi64_getlen(u,len) == 17 && u[16] && u[16] < 10, "QF2STRING: Normalization error!"); + ASSERT(mi64_getlen(u,len) == 17 && u[16] && u[16] < 10, "QF2STRING: Normalization error!"); os[1] = u[16] + '0'; // Put MSD to left of decimal point os[2] = '.'; for(i = 3; i < 38; ++i) { @@ -981,7 +981,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) else { printf("Multiply by denormalized operand not supported!"); - // ASSERT(HERE, 0,"ERROR in qfloat.c : qfmul"); + // ASSERT(0,"ERROR in qfloat.c : qfmul"); 
return QZRO; } @@ -992,7 +992,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) else { printf("Multiply by denormalized operand not supported!"); - // ASSERT(HERE, 0,"ERROR in qfloat.c : qfmul"); + // ASSERT(0,"ERROR in qfloat.c : qfmul"); return QZRO; } @@ -1005,7 +1005,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) if( ((sexp1 & ~MASK_SIGN) + (sexp2 & ~MASK_SIGN)) < 0x4000000000000000ull) { #if QFDEBUG WARN(HERE, "DENORM result in QFMUL ... flushing to 0.\n", "", 0); - ASSERT(HERE, fabs(qfdbl(q1)*qfdbl(q2)) < 1e-300, "Incorrect DENORM result in QFMUL!"); + ASSERT(fabs(qfdbl(q1)*qfdbl(q2)) < 1e-300, "Incorrect DENORM result in QFMUL!"); #endif return QZRO; } @@ -1046,8 +1046,8 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) /* DEBUG: make sure we didn't lose any bits of b or d during the conversion to float. */ #if QFDEBUG - if(!((uint64)db == b)) ASSERT(HERE, 0,"ERROR 120 in qfloat.c"); - if(!((uint64)dd == d)) ASSERT(HERE, 0,"ERROR 121 in qfloat.c"); + if(!((uint64)db == b)) ASSERT(0,"ERROR 120 in qfloat.c"); + if(!((uint64)dd == d)) ASSERT(0,"ERROR 121 in qfloat.c"); #endif adhi = da*dd; @@ -1056,7 +1056,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) #endif // USE_FMUL_FOR_LOW_WORD bvac = (uint64)leadz64(hi); - ASSERT(HERE, (bvac < 2), "ERROR 130 in qfloat.c"); /* Make sure at most the leftmost bit of high part is vacant. This check + ASSERT((bvac < 2), "ERROR 130 in qfloat.c"); /* Make sure at most the leftmost bit of high part is vacant. This check needs to remain in place until support for denormalized oprands is added. */ /* Now need to right-shift MUL_LOHI result (12-bvac) places, FMUL results (1-bvac) place, and add together. 
@@ -1068,12 +1068,12 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) /* lo = (hi << 52) + (lo >> 12) + (lo & 1) + (uint64)(adhi + bchi); hi = (hi >> 12); - printf("mul_hi = %16llX = %20llu\n", hi, hi); - printf("mul_lo = %16llX = %20llu\n", lo, lo); + printf("mul_hi = %16" PRIX64 " = %20" PRIu64 "\n", hi, hi); + printf("mul_lo = %16" PRIX64 " = %20" PRIu64 "\n", lo, lo); */ return_val.hi = (hi >> (11-bvac)); - ASSERT(HERE, (return_val.hi >> 52) == 1, "ERROR 140 in qfloat.c"); + ASSERT((return_val.hi >> 52) == 1, "ERROR 140 in qfloat.c"); return_val.lo = (hi << (53+bvac)) + (lo >> (11-bvac)) + ((lo >> (10-bvac)) & (uint64)1) + (((uint64)adhi + (uint64)bchi) << bvac); /* ^^^^rounding is here^^^^^ */ /* Maximize accuracy by converting to int prior to add. */ if(return_val.lo < (hi << (53+bvac))) /* had a carry out of lo part. */ @@ -1101,7 +1101,7 @@ struct qfloat qfmul(struct qfloat q1, struct qfloat q2) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFMUL!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFMUL!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFMUL!\n"); } #endif @@ -1120,7 +1120,7 @@ struct qfloat qfinc(struct qfloat x) return qfneg(qfdec(q)); // For x < 0, Use x + 1 = 1 - |x| = -(|x| - 1) } else { // Check for denormal (over and underflow): - ASSERT(HERE, ((q.hi>>52) + 1) >= 2, "Denormal not supported!"); + ASSERT(((q.hi>>52) + 1) >= 2, "Denormal not supported!"); if(q.hi < QONE.hi) { // This is just the significand-add section of qfsum with the following argument value specializations: @@ -1228,14 +1228,14 @@ struct qfloat qfinc(struct qfloat x) } } #if QFDEBUG - ASSERT(HERE, qfcmpeq(q, qfadd(x,QONE)), "qfinc fails!"); + ASSERT(qfcmpeq(q, qfadd(x,QONE)), "qfinc fails!"); #endif return q; } struct qfloat qfdec(struct qfloat q) { - ASSERT(HERE, 0, "qfdec not supported yet!"); + ASSERT(0, "qfdec not supported yet!"); return qfsub(q, QONE); } @@ -1272,7 +1272,7 @@ struct qfloat qfadd (struct qfloat q1, struct qfloat q2) } else { - 
ASSERT(HERE, 0,"ERROR: unrecognized sign combination in QFADD"); + ASSERT(0,"ERROR: unrecognized sign combination in QFADD"); } #if QFDEBUG double qres = qfdbl(q), dres = (1-2.0*sgn1)*qfdbl(q1) + (1-2.0*sgn2)*qfdbl(q2); // Must cast sgn1,2 to double prior to 1-... @@ -1282,7 +1282,7 @@ struct qfloat qfadd (struct qfloat q1, struct qfloat q2) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFADD!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFADD!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFADD!\n"); } #endif @@ -1316,7 +1316,7 @@ struct qfloat qfsub (struct qfloat q1, struct qfloat q2) } else { - ASSERT(HERE, 0,"ERROR: unrecognized sign combination in QFSUB"); + ASSERT(0,"ERROR: unrecognized sign combination in QFSUB"); } #if QFDEBUG double qres = qfdbl(q), dres = (1-2.0*sgn1)*qfdbl(q1) - (1-2.0*sgn2)*qfdbl(q2); // Must cast sgn1,2 to double prior to 1-... @@ -1326,7 +1326,7 @@ struct qfloat qfsub (struct qfloat q1, struct qfloat q2) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFSUB!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFSUB!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFSUB!\n"); } #endif @@ -1368,7 +1368,7 @@ struct qfloat qfsum(struct qfloat q1, struct qfloat q2) uint64 exp0, exp1, hi0, hi1, lo0, lo1, offword; /* Make sure both inputs are nonnegative. 
*/ - DBG_ASSERT(HERE, ((int64)q1.hi >= 0 && (int64)q2.hi >= 0),"ERROR 160 in qfloat.c"); + DBG_ASSERT(((int64)q1.hi >= 0 && (int64)q2.hi >= 0),"ERROR 160 in qfloat.c"); /* Larger of the two operands gets index 0 in our local length-2 arrays: */ if(qfcmple(q2, q1)) @@ -1481,7 +1481,7 @@ struct qfloat qfsum(struct qfloat q1, struct qfloat q2) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFSUM!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFSUM!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFSUM!\n"); } #endif return(return_val); @@ -1510,7 +1510,7 @@ struct qfloat qfdif(struct qfloat q1, struct qfloat q2) uint64 exp0, exp1, hi0, hi1, lo0, lo1, offword; /* Make sure both inputs are nonnegative. */ - DBG_ASSERT(HERE, ((int64)q1.hi >= 0) && ((int64)q2.hi >= 0),"ERROR 170 in qfloat.c"); + DBG_ASSERT(((int64)q1.hi >= 0) && ((int64)q2.hi >= 0),"ERROR 170 in qfloat.c"); /* Larger of the two operands gets index 0 in our local length-2 arrays: */ if(qfcmple(q2, q1)) @@ -1635,7 +1635,7 @@ struct qfloat qfdif(struct qfloat q1, struct qfloat q2) else /* Hi part is zero. Assuming lo part has lzlo lead zeros, right-shift lo (11-lzlo) bits and put that into hi. */ { #if QFDEBUG - printf("WARNING: catastrophic loss of precision in subtract:\n %16llX %16llX -\n %16llX %16llX\n", ptr0->hi, ptr0->lo, ptr1->hi, ptr1->lo); + printf("WARNING: catastrophic loss of precision in subtract:\n %16" PRIX64 " %16" PRIX64 " -\n %16" PRIX64 " %16" PRIX64 "\n", ptr0->hi, ptr0->lo, ptr1->hi, ptr1->lo); #endif // return QZRO; *** Taking the easy way out breaks older already-tested stuff in qtest() *** if((int32)rshift > 0) /* Lo part has > 53 SB, upper 53 of which get put into hi part. 
*/ @@ -1673,7 +1673,7 @@ struct qfloat qfdif(struct qfloat q1, struct qfloat q2) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFDIF!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFDIF!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFDIF!\n"); } #endif @@ -1709,7 +1709,7 @@ struct qfloat qfinv(struct qfloat x) /* Make sure x is properly normalized. This also catches potential divides-by-zero. */ if((x.hi & ~(MASK_SIGN + MASK_MANT)) == (uint64)0) { - ASSERT(HERE, 0,"ERROR: divide by denormalized input not supported."); + ASSERT(0,"ERROR: divide by denormalized input not supported."); } #ifdef X87_ASM ld = qfldbl(x); @@ -1750,7 +1750,7 @@ struct qfloat qfinv(struct qfloat x) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFINV!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFINV!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFINV!\n"); } #endif @@ -1773,7 +1773,7 @@ struct qfloat qfdiv(struct qfloat q1, struct qfloat q2) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFDIV!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFDIV!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFDIV!\n"); } #endif return qinv; @@ -1814,7 +1814,7 @@ struct qfloat qfsqrt(struct qfloat x) #endif /* Make sure x is nonnegative. This also catches potential divides-by-zero. */ - ASSERT(HERE, !(x.hi >> 63),"ERROR: sqrt of a negative number not supported."); + ASSERT(!(x.hi >> 63),"ERROR: sqrt of a negative number not supported."); if(qfcmpeq(x, QZRO)) return QZRO; #ifdef X87_ASM ld = qfldbl(x); @@ -1855,7 +1855,7 @@ struct qfloat qfsqrt(struct qfloat x) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFSQRT!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFSQRT!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFSQRT!\n"); } #endif /* Multiply 1/sqrt(x) by x to get sqrt(x). */ @@ -1871,7 +1871,7 @@ struct qfloat qisqrt(struct qfloat q) /* Make sure q is nonnegative. This also catches potential divides-by-zero. 
*/ if(q.hi >> 63) { - ASSERT(HERE, 0,"ERROR: sqrt of a negative number not supported."); + ASSERT(0,"ERROR: sqrt of a negative number not supported."); } else if(qfcmpeq(q, QZRO)) { @@ -1933,7 +1933,7 @@ struct qfloat qfagm(struct qfloat x, struct qfloat y) if(idiff < 8) break; } } - ASSERT(HERE, (i < 20), "Failure to converge in QFAGM!"); + ASSERT((i < 20), "Failure to converge in QFAGM!"); return a; } @@ -2014,14 +2014,14 @@ struct qfloat qflog(struct qfloat x) #endif struct qfloat expy; - ASSERT(HERE, qfcmplt(QZRO,x), "Arg must be > 0 in QFLOG!"); + ASSERT(qfcmplt(QZRO,x), "Arg must be > 0 in QFLOG!"); #if 0 // Algo 0 uint32 efield,k; // Find smallest k s.t. y = (2^k)*x >= 2^64: /* Extract 11-bit exponent field and add sign-extended power-of-2 exponent: */ - efield = ((x.hi >> 52) & MASK_EXP); ASSERT(HERE, efield,"Denormalized numbers not currently supported in QFLOG"); + efield = ((x.hi >> 52) & MASK_EXP); ASSERT(efield,"Denormalized numbers not currently supported in QFLOG"); // 1.0 has efield = 0x3FF, so use that to compute k: k = 64 - (efield - 0x3FF); y = qfmul_pow2(x, (uint64)k); @@ -2077,7 +2077,7 @@ struct qfloat qflog(struct qfloat x) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFLOG!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFLOG!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFLOG!\n"); } #endif return y; @@ -2142,8 +2142,8 @@ struct qfloat qfexp(struct qfloat x) pow2 = (xabs.hi >> 52) - 0x3fd; if(abs(pow2) > 9) { // If arg > +- 512, check for over/underflow which occurs for |arg| ~> 700 if(darg > 700) { - fprintf(stderr,"QFEXP: xabs.hi = %16llX, pow2 = %u, darg = %10.10e\n",xabs.hi,pow2,darg); - ASSERT(HERE,0,"expo overflow!"); + fprintf(stderr,"QFEXP: xabs.hi = %16" PRIX64 ", pow2 = %u, darg = %10.10e\n",xabs.hi,pow2,darg); + ASSERT(0,"expo overflow!"); } else if(darg < -700) { return QZRO; // expo underflow flushes to zero } @@ -2206,7 +2206,7 @@ struct qfloat qfexp(struct qfloat x) } else { nterm_idx = i - 0x3C5; // 
printf("Input exp-field = %3X, nterm_idx = %d; \n",i,nterm_idx); - ASSERT(HERE, nterm_idx < 64, "nterm_idx ou of range!"); + ASSERT(nterm_idx < 64, "nterm_idx ou of range!"); nterm = nterm_arr[nterm_idx]; } @@ -2262,7 +2262,7 @@ struct qfloat qfexp(struct qfloat x) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFEXP!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFEXP!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFEXP!\n"); } #endif return y; @@ -2318,7 +2318,7 @@ struct qfloat qfatan(struct qfloat x) if( rerr > 1e-12 ) { WARN(HERE, "High Error Level in QFATAN!\n", "", 0); } - ASSERT(HERE, rerr < 1e-2, "Fatal ROE in QFATAN!\n"); + ASSERT(rerr < 1e-2, "Fatal ROE in QFATAN!\n"); } #endif return y; @@ -2458,7 +2458,7 @@ struct qfloat qfcos (struct qfloat q) i = qfint(qt); // ...And truncate that to the next-smaller integer (128-bit int in this case). // For the quadrant, we only need the result modulo 4: quad = i.d0 & (uint64)3; - ASSERT(HERE, !i.d1 && (int64)i.d0 >= 0,"QFCOS: quadrant error"); + ASSERT(!i.d1 && (int64)i.d0 >= 0,"QFCOS: quadrant error"); qt = i64_to_q((int64)i.d0); // The calling argument is q mod pi/2: q = qfsub(q, qfmul(qt, QPIHALF)); @@ -2483,7 +2483,7 @@ struct qfloat qfsin (struct qfloat q) i = qfint(qt); // ...And truncate that to the next-smaller integer (128-bit int in this case). 
// For the quadrant, we only need the result modulo 4: quad = i.d0 & (uint64)3; - ASSERT(HERE, !i.d1 && (int64)i.d0 >= 0,"QFSIN: quadrant error"); + ASSERT(!i.d1 && (int64)i.d0 >= 0,"QFSIN: quadrant error"); qt = i64_to_q((int64)i.d0); // The calling argument is q mod pi/2: q = qfsub(q, qfmul(qt, QPIHALF)); @@ -2563,7 +2563,7 @@ struct qfloat qfcs1(struct qfloat q) static int first_entry = TRUE; if(first_entry) { first_entry = FALSE; - denoms = (struct qfloat *)malloc(20*qsz); ASSERT(HERE, denoms != NULL, "alloc failed!"); + denoms = (struct qfloat *)malloc(20*qsz); ASSERT(denoms != NULL, "alloc failed!"); for(i = 4; i < 38; i += 4) // Must limit largest index into QNINV[] to 40, hence (i+2) < 40 { j = (i>>1)-1; @@ -2573,7 +2573,7 @@ struct qfloat qfcs1(struct qfloat q) } #endif /* Make sure argument is in range... */ - DBG_ASSERT(HERE, (qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 200 in qfloat.c"); + DBG_ASSERT((qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 200 in qfloat.c"); #if (USE_CHEB_EXPANSION == 1) // Branchless algorithm: @@ -2621,7 +2621,7 @@ struct qfloat qfcs1(struct qfloat q) break; } } - ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged cos(x) summation!"); + ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged cos(x) summation!"); #elif (USE_CHEB_EXPANSION == 2) @@ -2686,7 +2686,7 @@ struct qfloat qfsn1(struct qfloat q) static int first_entry = TRUE; if(first_entry) { first_entry = FALSE; - denoms = (struct qfloat *)malloc(20*qsz); ASSERT(HERE, denoms != NULL, "alloc failed!"); + denoms = (struct qfloat *)malloc(20*qsz); ASSERT(denoms != NULL, "alloc failed!"); for(i = 3; i < 38; i += 4) // Must limit largest index into QNINV[] to 40, hence (i+2) < 40 { j = (i>>1); @@ -2696,7 +2696,7 @@ struct qfloat qfsn1(struct qfloat q) } #endif /* Make sure argument is in range... 
*/ - DBG_ASSERT(HERE, (qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 210 in qfloat.c"); + DBG_ASSERT((qfcmple(qfneg(QEPS), q) && qfcmplt(q, qfadd(QPIHALF, QEPS))), "ERROR 210 in qfloat.c"); #if (USE_CHEB_EXPANSION == 1) // Branchless algorithm: @@ -2743,7 +2743,7 @@ struct qfloat qfsn1(struct qfloat q) break; } } - ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged sin(x) summation!"); + ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged sin(x) summation!"); #elif (USE_CHEB_EXPANSION == 2) @@ -2835,7 +2835,7 @@ struct qfloat qfcosh(struct qfloat q) static int first_entry = TRUE; if(first_entry) { first_entry = FALSE; - denoms = (struct qfloat *)malloc(20*qsz); ASSERT(HERE, denoms != NULL, "alloc failed!"); + denoms = (struct qfloat *)malloc(20*qsz); ASSERT(denoms != NULL, "alloc failed!"); for(i = 4; i < 38; i += 4) // Limit largest index into QNINV[] to 40, hence (i+2) < 40 (--> i_max = 36 here) { j = (i>>1)-1; @@ -2883,7 +2883,7 @@ struct qfloat qfcosh(struct qfloat q) curr_term = qfmul(curr_term, mult); e_sum = (uint32)((sum .hi & ~MASK_SIGN) >> 52); e_new = (uint32)((curr_term.hi & ~MASK_SIGN) >> 52); - ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged cosh(x) summation!"); + ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged cosh(x) summation!"); } return sum; } // |x| >= 2 ? 
@@ -2904,7 +2904,7 @@ struct qfloat qfsinh(struct qfloat q) static int first_entry = TRUE; if(first_entry) { first_entry = FALSE; - denoms = (struct qfloat *)malloc(20*qsz); ASSERT(HERE, denoms != NULL, "alloc failed!"); + denoms = (struct qfloat *)malloc(20*qsz); ASSERT(denoms != NULL, "alloc failed!"); for(i = 3; i < 38; i += 4) // Must limit largest index into QNINV[] to 40, hence (i+2) < 40 { j = (i>>1); @@ -2951,7 +2951,7 @@ struct qfloat qfsinh(struct qfloat q) curr_term = qfmul(curr_term, mult); e_sum = (uint32)((sum .hi & ~MASK_SIGN) >> 52); e_new = (uint32)((curr_term.hi & ~MASK_SIGN) >> 52); - ASSERT(HERE, ((int32)(e_sum - e_new) > 115), "Unconverged sinh(x) summation!"); + ASSERT(((int32)(e_sum - e_new) > 115), "Unconverged sinh(x) summation!"); } return sum; } // |x| >= 2 ? @@ -3013,20 +3013,20 @@ int qtest(void) uint64 *ld_ptr = &ld, x87_mant, x87_sexp; // Test I/O functions: - ASSERT(HERE, STREQ( qf2str(QPI), "+3.14159265358979323846264338327950289 E+000" ), "I/O test failed!"); + ASSERT(STREQ( qf2str(QPI), "+3.14159265358979323846264338327950289 E+000" ), "I/O test failed!"); asm ("fldln2;" "fstpt %0" : "=m"(ld) : ); x87_mant = *ld_ptr; x87_sexp = *(ld_ptr+1) & 0x000000000000FFFFull; // Mask off high 48 bits of x87_sexp field, as these are uninited if(x87_mant != 0xB17217F7D1CF79ACull) { printf("ln2 = %30.20Le\n", ld); - printf("x87_mant = %16llx, expected 0xB17217F7D1CF79ACull\n", x87_mant); // x87_mant = b17217f7d1cf79ac, left-shift one place to off-shift hidden bit + printf("x87_mant = %16" PRIx64 ", expected 0xB17217F7D1CF79ACull\n", x87_mant); // x87_mant = b17217f7d1cf79ac, left-shift one place to off-shift hidden bit WARN(HERE, "Ln2 long-double mantissa conversion error", "", 0); } -// ASSERT(HERE, x87_mant == 0xB17217F7D1CF79ACull, "Ln2 long-double mantissa conversion error"); +// ASSERT(x87_mant == 0xB17217F7D1CF79ACull, "Ln2 long-double mantissa conversion error"); -// printf("x87_sexp = %16llx\n", x87_sexp); // x87_sexp = 3ffe, clear high 
4 bits to get qfloat/double-compatible exp-field - ASSERT(HERE, x87_sexp == 0x0000000000003FFEull, "Ln2 long-double exponent conversion error"); +// printf("x87_sexp = %16" PRIx64 "\n", x87_sexp); // x87_sexp = 3ffe, clear high 4 bits to get qfloat/double-compatible exp-field + ASSERT(x87_sexp == 0x0000000000003FFEull, "Ln2 long-double exponent conversion error"); asm ("fld1;" "fadd %%st(0), %%st(0);" @@ -3037,17 +3037,17 @@ int qtest(void) if(x87_mant != 0xB504F333F9DE6484ull) { printf("-Sqrt2 = %30.20Le\n", ld); - printf("x87_mant = %16llx, expected 0xB504F333F9DE6484ull\n", x87_mant); + printf("x87_mant = %16" PRIx64 ", expected 0xB504F333F9DE6484ull\n", x87_mant); WARN(HERE, "-Sqrt2 long-double mantissa conversion error", "", 0); } -// ASSERT(HERE, x87_mant == 0xB504F333F9DE6484ull, "-Sqrt2 long-double mantissa conversion error"); +// ASSERT(x87_mant == 0xB504F333F9DE6484ull, "-Sqrt2 long-double mantissa conversion error"); -// printf("x87_sexp = %16llx\n", x87_sexp); - ASSERT(HERE, x87_sexp == 0x000000000000BFFFull, "-Sqrt2 long-double exponent conversion error"); +// printf("x87_sexp = %16" PRIx64 "\n", x87_sexp); + ASSERT(x87_sexp == 0x000000000000BFFFull, "-Sqrt2 long-double exponent conversion error"); #endif - ASSERT(HERE, (ABS((int64)0x1234567890ABCDEFull) == 0x1234567890ABCDEFull), "ERROR 10 in qfloat.c"); + ASSERT((ABS((int64)0x1234567890ABCDEFull) == 0x1234567890ABCDEFull), "ERROR 10 in qfloat.c"); /*********** TEST THE TYPE CONVERSIONS **************/ #if TIMING_TEST @@ -3060,7 +3060,7 @@ int qtest(void) td += qfdbl(QEXP); } clock2 = clock(); - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; cycles /= 4.0*(double)titers; @@ -3069,52 +3069,52 @@ int qtest(void) #endif c = 0.0; d = qfdbl(QZRO); #if QFDEBUG - printf("dble(0.0) = %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(0.0) = %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - 
hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 12 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 12 in qfloat.c"); c = 1.0; d = qfdbl(QONE); #if QFDEBUG - printf("dble(1.0) = %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(1.0) = %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 14 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 14 in qfloat.c"); c = 2.0; d = qfdbl(QTWO); #if QFDEBUG - printf("dble(2.0) = %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(2.0) = %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 16 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 16 in qfloat.c"); c =-2.0; d = qfdbl(qfneg(QTWO)); #if QFDEBUG - printf("dble(-2.0)= %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(-2.0)= %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(HERE, 0,"ERROR 18 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(hidiff == (int64)0)) ASSERT(0,"ERROR 18 in qfloat.c"); c = 2*pi; d = qfdbl(Q2PI); #if QFDEBUG - printf("dble(2pi) = %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(2pi) = %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 20 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 20 in qfloat.c"); c =log(2.0);d = qfdbl(QLN2); #if QFDEBUG - printf("dble(ln2) = %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(ln2) = %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, 
*(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 22 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 22 in qfloat.c"); c = exp(1.0); d = qfdbl(QEXP); #if QFDEBUG - printf("dble(exp) = %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(exp) = %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 24 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 24 in qfloat.c"); c = -c; d = qfdbl(qfneg(QEXP)); #if QFDEBUG - printf("dble(-exp)= %16llX %16llX\n",*(int64 *)&c, *(int64 *)&d); + printf("dble(-exp)= %16" PRIX64 " %16" PRIX64 "\n",*(int64 *)&c, *(int64 *)&d); #endif - hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(HERE, 0,"ERROR 26 in qfloat.c"); + hidiff = *(int64 *)&c - *(int64 *)&d; if(!(ABS(hidiff) < (int64)2)) ASSERT(0,"ERROR 26 in qfloat.c"); /*********** TEST THE MULTIPLY ALGORITHM ************/ #if TIMING_TEST @@ -3127,7 +3127,7 @@ int qtest(void) hidiff += qfmul_pow2(QLN2,+1).hi; hidiff += qfmul_pow2(QLN2,+1).hi; } - ASSERT(HERE, hidiff, "!"); + ASSERT(hidiff, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3140,7 +3140,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfmul(QEXP,QEXP)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3150,65 +3150,65 @@ int qtest(void) /* e*e: 0x401D8E64B8D4DDAD, 0xCC33A3BA206B68AC */ q = qfmul(QEXP,QEXP); #if QFDEBUG - printf(" e*e = %16llX %16llX\n",q.hi,q.lo); + printf(" e*e = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); printf("dble( e*e) = %25.16e\n",qfdbl(q)); #endif qref.hi = 0x401D8E64B8D4DDADull; qref.lo = 0xCC33A3BA206B68ACull; // 
This is better than the separate hi/lo-word test, since it allows for the ROE to be of either sign: qerr = qfabs(qfsub(q,qref)); // Div-by-eps same as mul-by-by-2^118 derr = qfdbl( qfmul_pow2(qerr,+118) ); // The threshold here typically needs to be ~16*[magnitude of output] - ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!"); /* ln2*e: 0x3FFE258ECC242F82, 0x5DEC567E6A0E1111 */ q = qfmul(QLN2,QEXP); #if QFDEBUG - printf(" L2*e = %16llX %16llX\n",q.hi,q.lo); + printf(" L2*e = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); printf("dble(L2*e) = %25.16e\n",qfdbl(q)); #endif qref.hi = 0x3FFE258ECC242F82ull; qref.lo = 0x5DEC567E6A0E1111ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!"); /* ln2^2: 0x3FDEBFBDFF82C58E, 0xA86F16B06EC97360 */ q = qfmul(QLN2,QLN2); #if QFDEBUG - printf(" L2^2 = %16llX %16llX\n",q.hi,q.lo); + printf(" L2^2 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); printf("dble(L2^2) = %25.16e\n",qfdbl(q)); #endif qref.hi = 0x3FDEBFBDFF82C58Eull; qref.lo = 0xA86F16B06EC97360ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!"); /* ln2*2pi: 0x40116BB24190A0B6, 0xE765BE0D06135E60 */ q = qfmul(QLN2,Q2PI); #if QFDEBUG - printf(" Ln2*pi = %16llX %16llX\n",q.hi,q.lo); + printf(" Ln2*pi = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); printf("dble(Ln2*pi)= %25.16e\n",qfdbl(q)); #endif qref.hi = 0x40116BB24190A0B6ull; qref.lo = 0xE765BE0D06135E60ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!"); /* 2pi*e: 0x403114580B45D474, 0x9E6108579A2D0CA7 */ q = qfmul(Q2PI,QEXP); #if 
QFDEBUG - printf(" pi*e = %16llX %16llX\n",q.hi,q.lo); + printf(" pi*e = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); printf("dble(pi*e) = %25.16e\n",qfdbl(q)); #endif qref.hi = 0x403114580B45D474ull; qref.lo = 0x9E6108579A2D0CA7ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 128.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 128.0 ,"ERROR in QFMUL error-level check!"); /* 2pi*2pi: 0x4043BD3CC9BE45DE, 0x5A4ADC4D9B301183 */ q = qfmul(Q2PI,Q2PI); #if QFDEBUG - printf(" (2*pi)^2 = %16llX %16llX\n",q.hi,q.lo); + printf(" (2*pi)^2 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); printf("dble(2pi^2)= %25.16e\n",qfdbl(q)); printf("dble(2pi^2)= %25.16e\n",pi*pi); #endif qref.hi = 0x4043BD3CC9BE45DEull; qref.lo = 0x5A4ADC4D9B301183ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 128.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 128.0 ,"ERROR in QFMUL error-level check!"); /*********** TEST THE ADDITION ALGORITHM ************/ #if TIMING_TEST @@ -3217,7 +3217,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfadd(QEXP,QEXP)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3227,11 +3227,11 @@ int qtest(void) /* 2*pi+e: 0x402200C04CE72C66, 0x7821CB48D9B947AC */ q = qfadd(QEXP,Q2PI); #if QFDEBUG - printf(" 2*pi + e = %16llX %16llX\n",q.hi,q.lo); + printf(" 2*pi + e = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x402200C04CE72C66ull; qref.lo = 0x7821CB48D9B947ACull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 64.0 ,"ERROR in QFMUL error-level check!"); + ASSERT(derr < 64.0 ,"ERROR in QFMUL error-level check!"); /********** TEST THE SUBTRACTION ALGORITHM **********/ #if TIMING_TEST @@ -3240,7 +3240,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfsub(QEXP,QEXP)); } - 
ASSERT(HERE, td == 0.0, "!"); + ASSERT(td == 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3251,69 +3251,69 @@ int qtest(void) q.hi = 0x3FEFFFFFFFFFFFFFull; q.lo = 0xFFFFFFFFFFFFFFFFull; q = qfsub(q, q); #if QFDEBUG - printf("result1 = %16llX %16llX\n",q.hi,q.lo); + printf("result1 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = qref.lo = 0x0000000000000000ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!"); + ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!"); p.hi = 0x3FEFFFFFFFFFFFFFull; p.lo = 0xFFFFFFFFFFFFFFFFull; q.hi = 0x3FEFFFFFFFFFFFFFull; q.lo = 0xFFFFFFFFFFFFFFFEull; q = qfsub(p, q); #if QFDEBUG - printf("result2 = %16llX %16llX\n",q.hi,q.lo); + printf("result2 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x38A0000000000000ull; qref.lo = 0x0000000000000000ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!"); + ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!"); /* Both inputs normalized, output denormalized, with just one significant bit. 
*/ p.hi = 0x00FFFFFFFFFFFFFFull; p.lo = 0xFFFFFFFFFFFFFFFFull; q.hi = 0x00FFFFFFFFFFFFFFull; q.lo = 0xFFFFFFFFFFFFFFFEull; q = qfsub(p, q); #if QFDEBUG - printf("result3 = %16llX %16llX\n",q.hi,q.lo); + printf("result3 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x0000000000000000ull; qref.lo = 0x0000000000004000ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!"); + ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!"); /* Both inputs denormalized, output zero */ q.hi = 0x000FFFFFFFFFFFFFull; q.lo = 0xFFFFFFFFFFFFFFFFull; q = qfsub(q, q); #if QFDEBUG - printf("result4 = %16llX %16llX\n",q.hi,q.lo); + printf("result4 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = qref.lo = 0ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!"); + ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!"); /* Both inputs denormalized, output denormalized, with just one significant bit. 
*/ p.hi = 0x000FFFFFFFFFFFFFull; p.lo = 0xFFFFFFFFFFFFFFFFull; q.hi = 0x000FFFFFFFFFFFFFull; q.lo = 0xFFFFFFFFFFFFFFFEull; q = qfsub(p, q); #if QFDEBUG - printf("result5 = %16llX %16llX\n",q.hi,q.lo); + printf("result5 = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0ull; qref.lo = 1ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!"); + ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!"); /* 2*pi-e: 0x400C84EC1D7402C7, 0x39DB360DDEDB4F60 */ q = qfsub(Q2PI,QEXP); #if QFDEBUG - printf(" 2pi- e = %16llX %16llX\n",q.hi,q.lo); + printf(" 2pi- e = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x400C84EC1D7402C7ull; qref.lo = 0x39DB360DDEDB4F60ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr == 0.0 ,"ERROR in QFSUB error-level check!"); + ASSERT(derr == 0.0 ,"ERROR in QFSUB error-level check!"); /* e-2*pi: 0xC00C84EC1D7402C7, 0x39DB360DDEDB4F60 */ r = qfsub(QEXP,Q2PI); #if QFDEBUG - printf(" e-2pi = %16llX %16llX\n",r.hi,r.lo); + printf(" e-2pi = %16" PRIX64 " %16" PRIX64 "\n",r.hi,r.lo); #endif - if(!(qfcmpeq(r, qfneg(q)))) ASSERT(HERE, 0,"ERROR 54 in qfloat.c"); + if(!(qfcmpeq(r, qfneg(q)))) ASSERT(0,"ERROR 54 in qfloat.c"); /*********** TEST THE SQUARE ROOT ALGORITHM ************/ #if TIMING_TEST @@ -3322,7 +3322,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfsqrt(QEXP)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3332,7 +3332,7 @@ int qtest(void) /* sqrt(2): 0x3FF6A09E667F3BCC, 0x908B2FB1366EA958, qfsqrt gives ...956. 
*/ q = qfsqrt(QTWO); #if QFDEBUG - printf("sqrt(2) = %16llX %16llX\n",q.hi,q.lo); + printf("sqrt(2) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FF6A09E667F3BCCull; qref.lo = 0x908B2FB1366EA958ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3340,7 +3340,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFSQRT error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFSQRT error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFSQRT error-level check!"); /*********** TEST THE INVERSION AND DIVISION ALGORITHMS ************/ #if TIMING_TEST @@ -3349,7 +3349,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfinv(QEXP)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3359,7 +3359,7 @@ int qtest(void) /* 1/(2*pi):0x3FC45F306DC9C882, 0xA53F84EAFA3EA69B(B81B...), qfinv gives ...698. */ q = qfinv(Q2PI); #if QFDEBUG - printf("1/(2*pi) = %16llX %16llX\n",q.hi,q.lo); + printf("1/(2*pi) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FC45F306DC9C882ull; qref.lo = 0xA53F84EAFA3EA69Bull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3367,12 +3367,12 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFINV error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFINV error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFINV error-level check!"); /* 1/e: 0x3FD78B56362CEF37, 0xC6AEB7B1E0A4153E(4376...), qfinv gives ...53C. 
*/ q = qfinv(QEXP); #if QFDEBUG - printf("1/e = %16llX %16llX\n",q.hi,q.lo); + printf("1/e = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FD78B56362CEF37ull; qref.lo = 0xC6AEB7B1E0A4153Eull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3380,12 +3380,12 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFINV error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFINV error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFINV error-level check!"); /* 1/ln2: 0x3FF71547652B82FE, 0x1777D0FFDA0D23A7(D11D...), qfinv gives ...3A6. */ q = qfinv(QLN2); #if QFDEBUG - printf("1/ln(2) = %16llX %16llX\n",q.hi,q.lo); + printf("1/ln(2) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FF71547652B82FEull; qref.lo = 0x1777D0FFDA0D23A7ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3393,7 +3393,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFINV error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFINV error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFINV error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3401,7 +3401,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfdiv(QEXP,QPI)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3411,7 +3411,7 @@ int qtest(void) /* 2*pi/ln2:0x40222123045B5DEB, 0x9C5398CE82C06E4B(80DB...), qfdiv gives ...E4A. 
*/ q = qfdiv(Q2PI, QLN2); #if QFDEBUG - printf("2*pi/ln(2) = %16llX %16llX\n",q.hi,q.lo); + printf("2*pi/ln(2) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x40222123045B5DEBull; qref.lo = 0x9C5398CE82C06E4Bull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3419,7 +3419,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFDIV error-level check!", "", 0); } -// ASSERT(HERE, derr < 128.0 ,"ERROR in QFDIV error-level check!"); +// ASSERT(derr < 128.0 ,"ERROR in QFDIV error-level check!"); /*********** TEST THE TRANSCENDENTAL FUNCTIONS ************/ #if TIMING_TEST @@ -3428,7 +3428,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfsn1(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3443,7 +3443,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFSIN error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFSIN error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFSIN error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3451,7 +3451,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfcs1(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3466,7 +3466,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFCOS error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFCOS error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFCOS error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3474,7 +3474,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qftan(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3484,7 +3484,7 @@ 
int qtest(void) /* tan(Pi/4): Compare to 1: */ q = qftan(QPI4TH); #if QFDEBUG - printf("qtfan(PI/4) = %16llX %16llX\n",q.hi,q.lo); + printf("qtfan(PI/4) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = QONE.hi; qref.lo = QONE.lo; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3492,7 +3492,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFTAN error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFTAN error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFTAN error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3500,7 +3500,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfcot(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3510,7 +3510,7 @@ int qtest(void) /* cot(Pi/4): Compare to 1: */ q = qfcot(QPI4TH); #if QFDEBUG - printf("qfcot(PI/4) = %16llX %16llX\n",q.hi,q.lo); + printf("qfcot(PI/4) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = QONE.hi; qref.lo = QONE.lo; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3518,7 +3518,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFCOT error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFCOT error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFCOT error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3526,7 +3526,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfatan(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3536,7 +3536,7 @@ int qtest(void) /* atan(1): Compare to precomputed Pi/4: */ q = qfatan(QONE); #if QFDEBUG - printf("qatan(1) = %16llX %16llX\n",q.hi,q.lo); + printf("qatan(1) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = QPI4TH.hi; qref.lo = 
QPI4TH.lo; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3544,7 +3544,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFATAN error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFATAN error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFATAN error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3552,7 +3552,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qflog(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3562,18 +3562,18 @@ int qtest(void) /* log(2): Compare to precomputed QLN2 = {0x3FE62E42FEFA39EFull, 0x35793C7673007E5Full}: */ q = qflog(QTWO); #if QFDEBUG - printf("qlog(2) = %16llX %16llX\n",q.hi,q.lo); + printf("qlog(2) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FE62E42FEFA39EFull; qref.lo = 0x35793C7673007E5Full; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); - ASSERT(HERE, derr < 1100.0,"ERROR in QFLOG error-level check!"); // AGM-based log is fast but error-prone + ASSERT(derr < 1100.0,"ERROR in QFLOG error-level check!"); // AGM-based log is fast but error-prone /* log(2^64): Compare to precomputed log(2^64) = (same as log(2) but exp-field += 6): */ q = qfmul_pow2(QONE,+64); q = qflog(q); #if QFDEBUG - printf("qlog(2^64) = %16llX %16llX\n",q.hi,q.lo); + printf("qlog(2^64) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x40462E42FEFA39EFull; qref.lo = 0x35793C7673007E5Full; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3581,7 +3581,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFLOG error-level check!", "", 0); } -// ASSERT(HERE, derr < 1100.0 ,"ERROR in QFLOG error-level check!"); +// ASSERT(derr < 1100.0 ,"ERROR in QFLOG error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3589,7 +3589,7 @@ int qtest(void) for(i = 0; i 
< titers; ++i) { td += qfdbl(qfexp(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3599,7 +3599,7 @@ int qtest(void) /* exp(1): 0x4005BF0A8B145769, 0x5355FB8AC404E7A7(9E3B...), qfexp gives ...4E7A7, ~116 bits of accuracy. */ q = qfexp(QONE); #if QFDEBUG - printf("qexp(1) = %16llX %16llX\n",q.hi,q.lo); + printf("qexp(1) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x4005BF0A8B145769ull; qref.lo = 0x5355FB8AC404E7A7ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3607,13 +3607,13 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFEXP error-level check!", "", 0); } -// ASSERT(HERE, derr <= 64.0 ,"ERROR in QFEXP error-level check!"); +// ASSERT(derr <= 64.0 ,"ERROR in QFEXP error-level check!"); /* Sine and cosine are somewhat roundoff-error prone, so raise the error limit slightly. */ /* cos(1): 0x3FE14A280FB5068B, 0x923848CDB2ED0E37(A534...), qfcs1 gives ...D0E38, ~116 bits of accuracy */ q = qfcs1(QONE); #if QFDEBUG - printf("qcs1(1) = %16llX %16llX\n",q.hi,q.lo); + printf("qcs1(1) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FE14A280FB5068Bull; qref.lo = 0x923848CDB2ED0E37ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3621,18 +3621,18 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFCS1 error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFCS1 error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFCS1 error-level check!"); r = qfcos(QONE); #if QFDEBUG - printf("qcos(1) = %16llX %16llX\n",r.hi,r.lo); + printf("qcos(1) = %16" PRIX64 " %16" PRIX64 "\n",r.hi,r.lo); #endif - if(!(qfcmpeq(r, q))) ASSERT(HERE, 0,"ERROR 70 in qfloat.c"); + if(!(qfcmpeq(r, q))) ASSERT(0,"ERROR 70 in qfloat.c"); /* sin(1): 0x3FEAED548F090CEE, 0x0418DD3D2138A1E7(8651...), qfsn1 gives ...8A1E9, ~115 bits of 
accuracy */ q = qfsn1(QONE); #if QFDEBUG - printf("qsn1(1) = %16llX %16llX\n",q.hi,q.lo); + printf("qsn1(1) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FEAED548F090CEEull; qref.lo = 0x0418DD3D2138A1E7ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3640,18 +3640,18 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFSN1 error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFSN1 error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFSN1 error-level check!"); r = qfsin(QONE); #if QFDEBUG - printf("qsin(1) = %16llX %16llX\n",r.hi,r.lo); + printf("qsin(1) = %16" PRIX64 " %16" PRIX64 "\n",r.hi,r.lo); #endif - if(!(qfcmpeq(r, q))) ASSERT(HERE, 0,"ERROR 74 in qfloat.c"); + if(!(qfcmpeq(r, q))) ASSERT(0,"ERROR 74 in qfloat.c"); /* cos(100):0x3FEB981DBF665FDF, 0x63F433736617A041(5D8A...), qfcos gives ...7A023, ~114 bits of accuracy */ q = qfcos(i64_to_q((int64)100)); #if QFDEBUG - printf("qcos(100) = %16llX %16llX\n",q.hi,q.lo); + printf("qcos(100) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0x3FEB981DBF665FDFull; qref.lo = 0x63F433736617A041ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3659,12 +3659,12 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFCOS error-level check!", "", 0); } -// ASSERT(HERE, derr < 128.0 ,"ERROR in QFCOS error-level check!"); +// ASSERT(derr < 128.0 ,"ERROR in QFCOS error-level check!"); /* sin(100):0xBFE03425B78C4DB8, 0x0708F6155D083EB2(1C6B...), qfsin gives ...83EE5, ~109 bits of accuracy */ q = qfsin(i64_to_q((int64)100)); #if QFDEBUG - printf("qsin(100) = %16llX %16llX\n",q.hi,q.lo); + printf("qsin(100) = %16" PRIX64 " %16" PRIX64 "\n",q.hi,q.lo); #endif qref.hi = 0xBFE03425B78C4DB8ull; qref.lo = 0x0708F6155D083EB2ull; qerr = qfabs(qfsub(q,qref)); derr = qfdbl( qfmul_pow2(qerr,+118) ); @@ -3672,7 +3672,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in 
QFSIN error-level check!", "", 0); } -// ASSERT(HERE, derr < 128.0 ,"ERROR in QFSIN error-level check!"); +// ASSERT(derr < 128.0 ,"ERROR in QFSIN error-level check!"); /*********** Test the hyperbolic-trigs: **********************/ #if TIMING_TEST @@ -3681,7 +3681,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfsinh(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3696,7 +3696,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFSINH error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFSINH error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFSINH error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3704,7 +3704,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qfcosh(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3719,7 +3719,7 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFCOSH error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFCOSH error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFCOSH error-level check!"); #if TIMING_TEST clock1 = clock(); @@ -3727,7 +3727,7 @@ int qtest(void) for(i = 0; i < titers; ++i) { td += qfdbl(qftanh(QLN2)); } - ASSERT(HERE, td != 0.0, "!"); + ASSERT(td != 0.0, "!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3742,11 +3742,11 @@ int qtest(void) printf("derr = %10.5f\n", derr); WARN(HERE, "ERROR in QFTANH error-level check!", "", 0); } -// ASSERT(HERE, derr < 16.0 ,"ERROR in QFTANH error-level check!"); +// ASSERT(derr < 16.0 ,"ERROR in QFTANH error-level check!"); /*********** TEST THE INT --> QFLOAT and ROUND-TOWARD-ZERO AND ROUND-TO-NEAREST FUNCTIONS ************/ - ASSERT(HERE, CMPEQ128( 
qfint(qfneg( i64_to_q( 0ull))), NIL128 ), "error!"); - ASSERT(HERE, CMPEQ128( qfint(qfneg(i128_to_q(NIL128))), NIL128 ), "error!"); + ASSERT(CMPEQ128( qfint(qfneg( i64_to_q( 0ull))), NIL128 ), "error!"); + ASSERT(CMPEQ128( qfint(qfneg(i128_to_q(NIL128))), NIL128 ), "error!"); #if TIMING_TEST clock1 = clock(); hidiff = lodiff = 0ull; @@ -3755,7 +3755,7 @@ int qtest(void) hidiff += i128.d1; lodiff += i128.d0; } - ASSERT(HERE, !hidiff && (lodiff == titers), "!"); // NINT(ln2) = 1, titers times + ASSERT(!hidiff && (lodiff == titers), "!"); // NINT(ln2) = 1, titers times clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; @@ -3765,9 +3765,9 @@ int qtest(void) q = qfmul_pow2(QONE, -1); i128 = qfnint(q); #if QFDEBUG - printf("qfnint(0.5) = %16llX %16llX\n",i128.d1,i128.d0); + printf("qfnint(0.5) = %16" PRIX64 " %16" PRIX64 "\n",i128.d1,i128.d0); #endif - ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)1),"ERROR 80 in qfloat.c"); + ASSERT((!i128.d1 && i128.d0 == (uint64)1),"ERROR 80 in qfloat.c"); #if TIMING_TEST clock1 = clock(); @@ -3777,25 +3777,25 @@ int qtest(void) hidiff += i128.d1; lodiff += i128.d0 + qfint(QPI).d0; } - ASSERT(HERE, !hidiff && (lodiff == 3*titers), "!"); // INT(ln2) = 0 and INT(pi) = 3, summed (titers) times + ASSERT(!hidiff && (lodiff == 3*titers), "!"); // INT(ln2) = 0 and INT(pi) = 3, summed (titers) times clock2 = clock(); tdiff = (double)(clock2 - clock1); cycles = tdiff*CPU_FREQUENCY/CLOCKS_PER_SEC; cycles /= (double)titers; printf ("qfint : cycles/operation = %10.2f\n",cycles - cycles_for_qfdbl); #endif - i128 = qfnint(QHALF); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)1),"ERROR 82 in qfloat.c"); - i128 = qfint(QHALF); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)0),"ERROR 83 in qfloat.c"); - i128 = qfnint(QEXP); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)3),"ERROR 84 in qfloat.c"); - i128 = qfint(QEXP); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)2),"ERROR 85 in qfloat.c"); - i128 = 
qfnint(Q2PI); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)6),"ERROR 86 in qfloat.c"); - i128 = qfint(Q2PI); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)6),"ERROR 87 in qfloat.c"); + i128 = qfnint(QHALF); ASSERT((!i128.d1 && i128.d0 == (uint64)1),"ERROR 82 in qfloat.c"); + i128 = qfint(QHALF); ASSERT((!i128.d1 && i128.d0 == (uint64)0),"ERROR 83 in qfloat.c"); + i128 = qfnint(QEXP); ASSERT((!i128.d1 && i128.d0 == (uint64)3),"ERROR 84 in qfloat.c"); + i128 = qfint(QEXP); ASSERT((!i128.d1 && i128.d0 == (uint64)2),"ERROR 85 in qfloat.c"); + i128 = qfnint(Q2PI); ASSERT((!i128.d1 && i128.d0 == (uint64)6),"ERROR 86 in qfloat.c"); + i128 = qfint(Q2PI); ASSERT((!i128.d1 && i128.d0 == (uint64)6),"ERROR 87 in qfloat.c"); q = qfmul_pow2(Q2PI, 20); - i128 = qfnint(q); ASSERT(HERE, (!i128.d1 && i128.d0 == (uint64)6588397),"ERROR 90 in qfloat.c"); + i128 = qfnint(q); ASSERT((!i128.d1 && i128.d0 == (uint64)6588397),"ERROR 90 in qfloat.c"); q = qfmul_pow2(QPI, 125); /* This gives pi*2^125, which should still fit into a signed 128-bit int. 
*/ i128 = qfnint(q); - ASSERT(HERE, (i128.d1 = (uint64)0x6487ED5110B4611Aull && i128.d0 == (uint64)0x62633145C06E1000ull),"ERROR 92 in qfloat.c"); + ASSERT((i128.d1 = (uint64)0x6487ED5110B4611Aull && i128.d0 == (uint64)0x62633145C06E1000ull),"ERROR 92 in qfloat.c"); #if TIMING_TEST exit(0); diff --git a/src/qfloat.h b/src/qfloat.h index f6997c3c..9fedd712 100755 --- a/src/qfloat.h +++ b/src/qfloat.h @@ -206,8 +206,8 @@ struct qfloat qfcos_or_sin1(struct qfloat q, int cos_or_sin); #define QLSHIFT(__x, __n, __y)\ {\ /* Make sure sign/exp fields have been cleared and shift count >= 0: */\ - ASSERT(HERE, (__x.hi>>52) == 0,"QLSHIFT: sign/exp fields not zero!");\ - ASSERT(HERE, (int64)__n >= 0,"QLSHIFT: (int64)__n >= 0");\ + ASSERT((__x.hi>>52) == 0,"QLSHIFT: sign/exp fields not zero!");\ + ASSERT((int64)__n >= 0,"QLSHIFT: (int64)__n >= 0");\ /* Need to handle zero shift count separately: */\ if(__n == 0)\ {\ @@ -230,15 +230,15 @@ struct qfloat qfcos_or_sin1(struct qfloat q, int cos_or_sin); __y.lo = (uint64)0;\ }\ /* Make sure exp field at most 1 after shift: */\ - ASSERT(HERE, (__x.hi>>52) <= 1,"QLSHIFT: exp field out of range on output!");\ + ASSERT((__x.hi>>52) <= 1,"QLSHIFT: exp field out of range on output!");\ } /* (Logical) Right-shift: */ #define QRSHIFT(__x, __n, __y)\ {\ /* Make sure sign/exp fields have been cleared and shift count >= 0: */\ - ASSERT(HERE, (__x.hi>>52) == 0,"QRSHIFT: sign/exp fields not zero!");\ - ASSERT(HERE, (int64)(__n) >= 0,"QRSHIFT: (int64)(__n) >= 0 !");\ + ASSERT((__x.hi>>52) == 0,"QRSHIFT: sign/exp fields not zero!");\ + ASSERT((int64)(__n) >= 0,"QRSHIFT: (int64)(__n) >= 0 !");\ /* Need to handle zero shift count separately: */\ if((__n) == 0)\ {\ diff --git a/src/radix1008_ditN_cy_dif1.c b/src/radix1008_ditN_cy_dif1.c index 3961c967..8bb5796b 100755 --- a/src/radix1008_ditN_cy_dif1.c +++ b/src/radix1008_ditN_cy_dif1.c @@ -422,11 +422,11 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ qt = 
qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -465,7 +465,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -475,7 +475,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -518,24 +518,24 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - 
ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix1008_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix1008_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -568,13 +568,13 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset1008 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT(half_arr_offset1008 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { j = (1<<(2*(L2_SZ_VD-2))) + 4; // 
16+4 for sse2, 64+4 for avx } else { j = ODD_RADIX<<2; // 4*ODD_RADIX } - ASSERT(HERE, (radix1008_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix1008_creals_in_local_store checksum failed!"); + ASSERT((radix1008_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix1008_creals_in_local_store checksum failed!"); // Roots for radix-16 DFTs: VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one , 1.0 ); @@ -651,7 +651,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1025,12 +1025,12 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1065,7 +1065,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1116,7 +1116,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ break; }; } // printf("wts_idx_incr = %u\n",wts_idx_incr); - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1550,8 +1550,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1561,8 +1561,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1571,26 +1571,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt 
of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1600,8 +1600,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1822,7 +1822,7 @@ for(outer=0; outer <= 1; outer++) /*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/ for(j = 0; j < main_work_units; ++j) { - ASSERT(HERE, 0x0 == cy1008_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy1008_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1832,7 +1832,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2632,8 +2632,8 @@ void radix1008_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2697,23 +2697,23 @@ void radix1008_dit_pass1(double a[], int n) half_arr= tmp + 0x02; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = 
(tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix1008_main_carry_loop.h b/src/radix1008_main_carry_loop.h index 11303f91..d3927fa3 100755 --- a/src/radix1008_main_carry_loop.h +++ b/src/radix1008_main_carry_loop.h @@ -229,7 +229,7 @@ for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */ // (and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1). // *But*: since the init macro does an on-the-fly version of this between j,j+2 portions, external code co2=co3 must come *after* both ctmp-data octets are inited. 
#ifdef USE_AVX512 - ASSERT(HERE, 0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!"); + ASSERT(0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!"); #endif AVX_cmplx_carry_fast_wtsinit_X8(add1,add2,add3, itmp, half_arr,sign_mask, n_minus_sil,n_minus_silp1,sinwt,sinwtm1, sse_bw,sse_n) diff --git a/src/radix1024_ditN_cy_dif1.c b/src/radix1024_ditN_cy_dif1.c index 70206319..5ecf1504 100755 --- a/src/radix1024_ditN_cy_dif1.c +++ b/src/radix1024_ditN_cy_dif1.c @@ -221,7 +221,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ // [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex) // to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros: struct complex t[RADIX], *tptr; - double *addr,*addi; + const double *addr,*addi; int *itmp,*itm2; // Pointer into the bjmodn array int err; static int first_entry=TRUE; @@ -375,11 +375,11 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -418,7 +418,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -428,7 +428,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -471,22 +471,22 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage cslots_in_local_store = radix1024_creals_in_local_store + (20+RADIX/2)/2; // Just add enough int64 space for both cases, plus some - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix1024_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -520,8 +520,8 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset1024 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix1024_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix1024_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset1024 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix1024_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix1024_creals_in_local_store 
checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(isrt2,ISRT2); @@ -672,7 +672,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1194,14 +1194,14 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < NDIVR/CY_THREADS; j++) @@ -1221,7 +1221,7 @@ int radix1024_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); if(CY_THREADS > 1) { for(ithread = 1; ithread < CY_THREADS; ithread++) @@ -1400,8 +1400,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1411,8 +1411,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1421,20 +1421,20 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; - ASSERT(HERE, ((tmp + 0x1080)->d0 == ISRT2 && (tmp + 0x1080)->d1 == ISRT2), "thread-local memcheck failed!"); + ASSERT(((tmp + 0x1080)->d0 == ISRT2 && (tmp + 0x1080)->d1 == ISRT2), "thread-local memcheck failed!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && 
(tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif @@ -1444,11 +1444,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1461,8 +1461,8 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_SSE2) // This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots: - dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = 
tmp->d0 * (tmp+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif // scale gets set immediately prior to calling carry macro, hence no use checking it here. /* init carries */ @@ -1685,7 +1685,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy1024_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy1024_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1695,7 +1695,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("%s end ; #tasks = %d, #free_tasks = %d\n",func, tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1869,7 +1869,7 @@ void radix1024_dif_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n >> 10; p1 = NDIVR; @@ -2605,7 +2605,7 @@ void radix1024_dit_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n >> 10; p1 = NDIVR; @@ -3214,8 +3214,8 @@ void radix1024_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -3442,11 +3442,11 @@ void radix1024_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -3456,18 +3456,18 @@ void radix1024_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = 
(tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { #ifdef USE_AVX512 /* No-Op */ #else - dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } diff --git a/src/radix1024_main_carry_loop.h b/src/radix1024_main_carry_loop.h index 761bd6e6..8f07752b 100755 --- a/src/radix1024_main_carry_loop.h +++ b/src/radix1024_main_carry_loop.h @@ -168,8 +168,8 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee // In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass: if(target_idx == j) { #ifdef USE_SSE2 - addr = (double *)s1p00 + target_set; - *addr += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor + double *addr_ = (double *)s1p00 + target_set; + *addr_ += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. 
includes fwd DWT weight and n/2 factor #else // target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum: l = target_set&1; target_set >>= 1; @@ -470,26 +470,28 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) { // LOACC with tunable DWT-weights chaining /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy_r; itmp = bjmodn; + l = 0; itmp = bjmodn; + double *addr_ = cy_r; for(ntmp = 0; ntmp < RADIX>>2; ntmp++) { jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp]; // poff[] = p04,08,... // Re-init weights every 4th macro invocation to keep errors under control: - cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } else { // HiACC: /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy_r; itmp = bjmodn; + l = 0; itmp = bjmodn; + double *addr_ = cy_r; for(ntmp = 0; ntmp < RADIX>>2; ntmp++) { jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp]; // poff[] = p04,08,... 
- cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } // LOACC or HIACC? @@ -683,13 +685,14 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee #else // Scalar-double mode: // Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2): - ntmp = 0; addr = cy_r; addi = cy_i; + ntmp = 0; + double *addr_ = cy_r, *addi_ = cy_i; for(m = 0; m < RADIX>>2; m++) { jt = j1 + poff[m]; jp = j2 + poff[m]; - fermat_carry_norm_pow2_errcheck(a[jt ],a[jp ],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p1],a[jp+p1],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p2],a[jp+p2],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p3],a[jp+p3],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; + fermat_carry_norm_pow2_errcheck(a[jt ],a[jp ],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p1],a[jp+p1],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + 
fermat_carry_norm_pow2_errcheck(a[jt+p2],a[jp+p2],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p3],a[jp+p3],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; } #endif /* #ifdef USE_SSE2 */ diff --git a/src/radix128_ditN_cy_dif1.c b/src/radix128_ditN_cy_dif1.c index 76e45306..5faef1a7 100755 --- a/src/radix128_ditN_cy_dif1.c +++ b/src/radix128_ditN_cy_dif1.c @@ -223,7 +223,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // Local storage: We must use an array here because scalars have no guarantees about relative address offsets // [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex) // to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros: - double *addr, *addi; + const double *addr, *addi; struct complex t[RADIX], *tptr; int *itmp,*itm2; // Pointer into the bjmodn array int err; @@ -383,11 +383,11 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -426,7 +426,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -436,7 +436,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -479,23 +479,23 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread cslots_in_local_store = radix128_creals_in_local_store + (20+RADIX/2)/2; // Just add enough int64 space for both cases, plus some - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix128_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -604,8 +604,8 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif -// ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix128_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix128_creals_in_local_store checksum failed!"); +// ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + 
ASSERT((radix128_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix128_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one , 1.0 ); @@ -753,7 +753,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1168,14 +1168,14 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < NDIVR/CY_THREADS; j++) @@ -1195,7 +1195,7 @@ int radix128_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); if(CY_THREADS > 1) { for(ithread = 1; ithread < CY_THREADS; ithread++) @@ -1374,8 +1374,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1385,8 +1385,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1395,19 +1395,19 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), 
"thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif @@ -1417,11 +1417,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1434,8 +1434,8 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_SSE2) // This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots: - dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif // scale gets set 
immediately prior to calling carry macro, hence no use checking it here. /* init carries */ @@ -1658,7 +1658,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy128_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy128_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1668,7 +1668,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("%s end ; #tasks = %d, #free_tasks = %d\n",func, tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -2676,8 +2676,8 @@ void radix128_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2823,13 +2823,13 @@ void radix128_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); // Must make this check 'fuzzy' to allow for wrong-way-round experiments: - ASSERT(HERE, (fabs(isrt2->d0 - ISRT2) < EPS && fabs(isrt2->d1 - ISRT2) < EPS), "thread-local memcheck failed!"); + ASSERT((fabs(isrt2->d0 - ISRT2) < EPS && fabs(isrt2->d1 - ISRT2) < EPS), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2839,18 +2839,18 @@ void radix128_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; 
ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { #ifdef USE_AVX512 /* No-Op */ #else - dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } diff --git a/src/radix128_main_carry_loop.h b/src/radix128_main_carry_loop.h index 8a6b1df7..8a813bb6 100755 --- a/src/radix128_main_carry_loop.h +++ b/src/radix128_main_carry_loop.h @@ -229,8 +229,8 @@ as are the index offsets of each sets of complex outputs in the A-array: [jt,jp] // In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass: if(target_idx == j) { #ifdef USE_SSE2 - addr = (double *)s1p00 + target_set; - *addr += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. 
includes fwd DWT weight and n/2 factor + double *addr_ = (double *)s1p00 + target_set; + *addr_ += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor #else // target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum: l = target_set&1; target_set >>= 1; @@ -531,26 +531,28 @@ as are the index offsets of each sets of complex outputs in the A-array: [jt,jp] if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) { // LOACC with tunable DWT-weights chaining /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy_r; itmp = bjmodn; + l = 0; itmp = bjmodn; + double *addr_ = cy_r; for(ntmp = 0; ntmp < RADIX>>2; ntmp++) { jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp]; // poff[] = p04,08,... // Re-init weights every 4th macro invocation to keep errors under control: - cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } else { // HiACC: /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy_r; itmp = bjmodn; + l = 0; itmp = bjmodn; + 
double *addr_ = cy_r; for(ntmp = 0; ntmp < RADIX>>2; ntmp++) { jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp]; // poff[] = p04,08,... - cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } // LOACC or HIACC? @@ -744,13 +746,14 @@ as are the index offsets of each sets of complex outputs in the A-array: [jt,jp] #else // Scalar-double mode: // Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2); - ntmp = 0; addr = cy_r; addi = cy_i; + ntmp = 0; + double *addr_ = cy_r, *addi_ = cy_i; for(m = 0; m < RADIX>>2; m++) { jt = j1 + poff[m]; jp = j2 + poff[m]; // poff[] = p04,08,... 
- fermat_carry_norm_pow2_errcheck(a[jt ],a[jp ],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; + fermat_carry_norm_pow2_errcheck(a[jt ],a[jp ],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; } #endif /* #ifdef USE_SSE2 */ diff --git a/src/radix12_ditN_cy_dif1.c b/src/radix12_ditN_cy_dif1.c index 5e6866b6..bca6532c 100755 --- a/src/radix12_ditN_cy_dif1.c +++ b/src/radix12_ditN_cy_dif1.c @@ -253,7 +253,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "radix12_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "radix12_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -312,11 +312,11 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -355,7 +355,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -365,7 +365,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -401,18 +401,18 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 
64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix12_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix12_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 48 16-byte slots of sc_arr for temporaries, next 2 for the nontrivial complex roots, next 6 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -819,12 +819,12 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy10 = (double *)malloc(j); ptr_prod += (uint32)(_cy10== 0x0); _cy11 = (double *)malloc(j); ptr_prod += (uint32)(_cy11== 0x0); - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix12_ditN_cy_dif1."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix12_ditN_cy_dif1."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/20-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix12_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix12_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -848,7 +848,7 @@ int radix12_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); first_entry=FALSE; } /* endif(first_entry) */ @@ -1002,8 +1002,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1013,8 +1013,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1023,20 +1023,20 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // 
SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif @@ -1196,7 +1196,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy12_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy12_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1206,7 +1206,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -1259,7 +1259,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 1,"radix20_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 1,"radix20_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ _cy00[ithread] = _cy00[ithread-1]; _cy01[ithread] = _cy01[ithread-1]; _cy02[ithread] = _cy02[ithread-1]; @@ -1888,8 +1888,8 @@ void radix12_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -1942,17 +1942,17 @@ void radix12_dit_pass1(double a[], int n) r11 = r00 + 0x16; s1p11 = tmp + 0x16; half_arr= tmp + 0x23; /* This table needs 20x16 bytes */ // half_arr = r00 + 0x3b; This is where the value of half_arr_offset12 comes from #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); tmp = half_arr; #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * 
(tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix144_ditN_cy_dif1.c b/src/radix144_ditN_cy_dif1.c index 885298e2..eb498609 100755 --- a/src/radix144_ditN_cy_dif1.c +++ b/src/radix144_ditN_cy_dif1.c @@ -334,7 +334,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -402,11 +402,11 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -444,7 +444,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool 
= threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -454,7 +454,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -494,24 +494,24 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix144_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix144_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 
64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix144_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -552,7 +552,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] sse2_rnd= tmp + 0x01; // sc_ptr += 2 = 0x296; This is where the value of half_arr_offset144 comes from half_arr= tmp + 0x02; // This table needs 32 x 16 bytes in SSE2 mode #endif - ASSERT(HERE, (radix144_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!"); + ASSERT((radix144_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); #if 0 // Here this trick actually degrades accuracy ... must be interaction with the radix-9 DFTs of some kind @@ -951,12 +951,12 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -980,7 +980,7 @@ int radix144_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1132,8 +1132,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1143,8 +1143,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1153,26 +1153,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1305,7 +1305,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy144_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy144_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1315,7 +1315,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2018,8 +2018,8 @@ void radix144_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -2134,20 +2134,20 @@ void radix144_dit_pass1(double a[], int n) half_arr= tmp + 0x02; // This table needs 20 x 16 bytes in SSE2 mode #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, 
"thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix160_ditN_cy_dif1.c b/src/radix160_ditN_cy_dif1.c index 138a659a..791a96e6 100755 --- a/src/radix160_ditN_cy_dif1.c +++ b/src/radix160_ditN_cy_dif1.c @@ -304,7 +304,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -372,11 +372,11 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -414,7 +414,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -424,7 +424,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -464,24 +464,24 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix160_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix160_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix160_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -521,7 +521,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] sse2_rnd= tmp + 0x01; // sc_ptr += 0x(290 + 50 + 2) = 0x2e2; This is where the value of half_arr_offset160 comes from half_arr= tmp + 0x02; #endif - ASSERT(HERE, (radix160_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix160_creals_in_local_store checksum failed!"); + ASSERT((radix160_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix160_creals_in_local_store checksum failed!"); /* These remain fixed: 
*/ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); #if 1 @@ -1308,12 +1308,12 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1337,7 +1337,7 @@ int radix160_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1489,8 +1489,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1500,8 +1500,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1510,26 +1510,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1659,7 +1659,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy160_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy160_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1669,7 +1669,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2529,8 +2529,8 @@ void radix160_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -3024,21 +3024,21 @@ void radix160_dit_pass1(double a[], int n) half_arr= tmp + 0x02; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * 
(tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix16_dif_dit_pass.c b/src/radix16_dif_dit_pass.c index 4b96f6ee..8e1f6095 100755 --- a/src/radix16_dif_dit_pass.c +++ b/src/radix16_dif_dit_pass.c @@ -210,17 +210,17 @@ void radix16_dif_pass (double a[], int n, struct complex rt0[], stru // fprintf(stderr, "radix16_dif_dit_pass pfetch_dist = %d\n", pfetch_dist); max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); if(sc_arr != 0x0) { // Have previously-malloc'ed local storage free((void *)sc_arr); sc_arr=0x0; } // v19 alloc'ed 72* ... v20 needs [1+1+4+8] = 18 more slots in SSE2 mode, [1+1+2+4] = 8 more in AVX/AVX2 mode, [1+1+1+2] = 5 more in AVX-512 mode, // just use 20 more slots in all cases for simplicity's sake. 
Further add 12 slots for doubled-into-vectors 6-term Chebyshev expansions of cos, sin: - sc_arr = ALLOC_VEC_DBL(sc_arr, 104*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 104*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots, last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array. @@ -332,7 +332,7 @@ void radix16_dif_pass (double a[], int n, struct complex rt0[], stru /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); dtmp = (double)12.56637061435917295376/n; // twopin = 2*pi/[complex FFT length] = 2*pi/(n/2) = 4*pi/n r1 = __r0 + thr_id*104; isrt2 = r1 + 0x20; @@ -403,7 +403,7 @@ void radix16_dif_pass (double a[], int n, struct complex rt0[], stru encounter the same sets or index strides (albeit in opposite order), can split such tests between them: *** 2014: Failure of this assertion led me to find dependence on it in my new AVX2/FMA-based DIT macro *** *** [But fix obviates said dependence, so no longer appropriate to enforce it.] *** - ASSERT(HERE, p2 == p1+p1, "radix16_dif_pass: p2 != p1+p1!"); + ASSERT(p2 == p1+p1, "radix16_dif_pass: p2 != p1+p1!"); */ iroot_prim=(incr >> 5); /* (incr/2)/radix_now */ for(m=0; m < nloops; m++) /* NLOOPS may range from 1 (if first pass radix = 16) to P*N/32 (last pass radix = 16). 
*/ @@ -543,8 +543,8 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou // Loop to test various fast alternatives to j/(n>>4) for every j < n/2: for(j = 0; j < (n>>1); j++) { // This fails for e.g. j = 393205 and (n>>4) = 393216: - // ASSERT(HERE, __MULH32(j,imult) == j/(n>>4), "umulh32(j,imult) != j/(n>>4)"); - ASSERT(HERE, (int)((float)j*fndiv16) == j/(n>>4), "(float)j*fndiv16 != j/(n>>4)"); + // ASSERT(__MULH32(j,imult) == j/(n>>4), "umulh32(j,imult) != j/(n>>4)"); + ASSERT((int)((float)j*fndiv16) == j/(n>>4), "(float)j*fndiv16 != j/(n>>4)"); } i = 163397; const double pi4_dbl = (double)0.78539816339744830961, twopin_dbl = 16*pi4_dbl/n; @@ -556,7 +556,7 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou scos[0] = cos(gamma[j]); scos[1] = sin(gamma[j]); is0[j] = ((unsigned int)io[j] - 2) < 4; is1[j] = (io[j] > 3); jj[j] = IS_ODD((io[j]+1)>>1); - ASSERT(HERE, (int)ff[j] == io[j], "ff != io error!"); + ASSERT((int)ff[j] == io[j], "ff != io error!"); twiddle[j].re = sign[is0[j]]*scos[jj[j]]; twiddle[j].im = sign[is1[j]]*scos[jj[j]^1]; } #endif @@ -988,7 +988,7 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou *add2++ = rt; // cF, will get multiplied by 1/c7 to yield __cF7 // This places us at add0 == c8 and add1 = c12. 
- ASSERT(HERE, add0 == (double *)cc0+16 && add1 == (double *)cc0+32 && add2 == (double *)cc0+44, "add0,1,2 checksum failed in AVX2 sincos inits!"); + ASSERT(add0 == (double *)cc0+16 && add1 == (double *)cc0+32 && add2 == (double *)cc0+44, "add0,1,2 checksum failed in AVX2 sincos inits!"); /* At this point, the 11 ymm-sized [32-byte] chunks starting at &cc0 contain the following scalar-double data: @@ -1621,7 +1621,7 @@ notation below is low-to-high-[byte|word] within xmm-regs; '|' denotes dword bou addr += p1; prefetch_p_doubles(addr); #endif - /* Debug: check for overflow of + terms: */ ASSERT(HERE, m1+m17 >= m1 && m2+m18 >= m2,"Overflow of [0,8b] term!"); + /* Debug: check for overflow of + terms: */ ASSERT(m1+m17 >= m1 && m2+m18 >= m2,"Overflow of [0,8b] term!"); a[jt ]= t1+t17; a[jp ]= t2+t18; b[jt ]=qreduce( m1+m17 ); b[jp ]=qreduce( m2+m18 ); // + terms in 0,8b a[jt+p1 ]= t1-t17; a[jp+p1 ]= t2-t18; b[jt+p1 ]=qreduce( m1-m17+q4); b[jp+p1 ]=qreduce( m2-m18+q4); // - terms in -4b,4b // mpy by E^4=i is inlined here: @@ -2010,7 +2010,7 @@ void radix16_dit_pass (double a[], int n, struct complex rt0[], stru #endif #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif #ifdef USE_SSE2 @@ -2023,15 +2023,15 @@ void radix16_dit_pass (double a[], int n, struct complex rt0[], stru { max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); if(sc_arr != 0x0) { // Have previously-malloc'ed local storage free((void *)sc_arr); sc_arr=0x0; } - sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads); 
if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots, last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array. @@ -2083,7 +2083,7 @@ void radix16_dit_pass (double a[], int n, struct complex rt0[], stru /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); r1 = __r0 + thr_id*72; isrt2 = r1 + 0x20; cc0 = r1 + 0x21; @@ -2108,9 +2108,9 @@ void radix16_dit_pass (double a[], int n, struct complex rt0[], stru // body (both C and ASM). Since such checks may be runlength-dependent, need to be cheap enough to leave on // all the time, as here where we do them just once prior to entering the processing loop. 
Since DIF and DIT // encounter the same sets or index strides (albeit in opposite order), can split such tests between them: - ASSERT(HERE, p4 == p2+p2, "radix16_dit_pass: p4 != p2+p2!"); - ASSERT(HERE, p8 == p4+p4, "radix16_dit_pass: p8 != p4+p4!"); - ASSERT(HERE, p12 == p4+p8, "radix16_dit_pass: p12 != p4+p8!"); + ASSERT(p4 == p2+p2, "radix16_dit_pass: p4 != p2+p2!"); + ASSERT(p8 == p4+p4, "radix16_dit_pass: p8 != p4+p4!"); + ASSERT(p12 == p4+p8, "radix16_dit_pass: p12 != p4+p8!"); iroot_prim=(incr >> 5); /* (incr/2)/radix_now */ @@ -2398,7 +2398,7 @@ void radix16_dit_pass (double a[], int n, struct complex rt0[], stru *add1++ = it; // s15 slot will hold __rF = s15/c15 // This places us at add0 == c8 and add1 = c12. - ASSERT(HERE, add0 == (double *)cc0+16 && add1 == (double *)cc0+32, "add0,1 checksum failed in AVX2 DIT sincos inits!"); + ASSERT(add0 == (double *)cc0+16 && add1 == (double *)cc0+32, "add0,1 checksum failed in AVX2 DIT sincos inits!"); /* At this point, the 8 ymm-sized [32-byte] chunks starting at &cc0 contain the following scalar-double data: @@ -2420,9 +2420,9 @@ void radix16_dit_pass (double a[], int n, struct complex rt0[], stru add0[0x00] = c; add0[0x10] = tan; add0[0x20] = 1.0; - // ASSERT(HERE, *(add0-1) == ISRT2, "Scalar ISRT2 bad!"); + // ASSERT(*(add0-1) == ISRT2, "Scalar ISRT2 bad!"); c_tmp = cc0 + 0x22; // 1.0 x 4 - // ASSERT(HERE, c_tmp->d0 == 1.0 && c_tmp->d0 == c_tmp->d1 && c_tmp->d0 == c_tmp->d2 && c_tmp->d0 == c_tmp->d3, "1.0 x 4 mismatch!"); + // ASSERT(c_tmp->d0 == 1.0 && c_tmp->d0 == c_tmp->d1 && c_tmp->d0 == c_tmp->d2 && c_tmp->d0 == c_tmp->d3, "1.0 x 4 mismatch!"); /* Scalar data starting at add0 = cc0 now laid out as below: diff --git a/src/radix16_dif_dit_pass_asm.h b/src/radix16_dif_dit_pass_asm.h index 4ea15c1b..75d3ab38 100755 --- a/src/radix16_dif_dit_pass_asm.h +++ b/src/radix16_dif_dit_pass_asm.h @@ -49,7 +49,7 @@ The workaround is to use -O1 or higher, whether one is building a debuggable bin {\ double 
*add0,*add1,*add2;\ add0 = (double *)__twid_ptr; /* add0 points to 16 cos-data-to-be-inverted; Need a double-ptr on lhs here */\ - ASSERT(HERE, add0 != 0x0, "Null add0 pointer!");\ + ASSERT(add0 != 0x0, "Null add0 pointer!");\ add1 = add0 + 16; /* add1 points to block of memory temporarily used to store the corresponding sine data */\ add2 = add0 + 32; /* add2 points to block of memory temporarily used to store the 11 [0-padded to 12] cosine data which need to be divided by other cosines (i.e. multiplied by inverses) */\ @@ -134,7 +134,7 @@ The workaround is to use -O1 or higher, whether one is building a debuggable bin *add2++ = __cF; /* cF, will get multiplied by 1/c7 to yield __cF7 */\ \ /* This places us at add0 == c8 and add1 = c12. */\ - ASSERT(HERE, add0 == (double *)__twid_ptr+16 && add1 == (double *)__twid_ptr+32 && add2 == (double *)__twid_ptr+44, "add0,1,2 checksum failed in AVX2 sincos inits!");\ + ASSERT(add0 == (double *)__twid_ptr+16 && add1 == (double *)__twid_ptr+32 && add2 == (double *)__twid_ptr+44, "add0,1,2 checksum failed in AVX2 sincos inits!");\ /* At this point, the 11 ymm-sized [32-byte] chunks starting at &__twid_ptr contain the following scalar-double data: diff --git a/src/radix16_ditN_cy_dif1.c b/src/radix16_ditN_cy_dif1.c index 0aa9da4a..804fc52f 100755 --- a/src/radix16_ditN_cy_dif1.c +++ b/src/radix16_ditN_cy_dif1.c @@ -409,7 +409,7 @@ int radix16_ditN_cy_dif1 (double a[], int n, int nwt, int nwt_bits, if(k > 60) k -= 61; } } - ASSERT(HERE, isPow2(N2), "N/2 not a power of 2!"); + ASSERT(isPow2(N2), "N/2 not a power of 2!"); l2_n2 = trailz32(N2); // ******* For carry step, also need the 16 values of bimodnmod61 for i = j*(n/radix0), j = 0,...,15 ************ #endif @@ -439,11 +439,11 @@ int radix16_ditN_cy_dif1 (double a[], int n, int nwt, int nwt_bits, qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -482,7 +482,7 @@ int radix16_ditN_cy_dif1 (double a[], int n, int nwt, int nwt_bits, if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -493,7 +493,7 @@ int radix16_ditN_cy_dif1 (double a[], int n, int nwt, int nwt_bits, main_work_units = 0; pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -537,18 +537,18 @@ int radix16_ditN_cy_dif1 (double a[], int n, int nwt, int nwt_bits, #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + 
ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix16_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix16_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots, next 16 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -685,7 +685,7 @@ int radix16_ditN_cy_dif1 (double a[], int n, int nwt, int nwt_bits, // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. 
qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; - printf("j = %3u: cos[j*Pi/2] = 0x%16llX, sin[j*Pi/2] = 0x%16llX\n",j,qfdbl_as_uint64(qx),qfdbl_as_uint64(qy)); + printf("j = %3u: cos[j*Pi/2] = %#16" PRIX64 ", sin[j*Pi/2] = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx),qfdbl_as_uint64(qy)); } exit(0); #endif @@ -1207,14 +1207,14 @@ half_arr+5*radix radix [LOACC-only] inv_mult-lut _cy_iE = (double *)malloc(j); ptr_prod += (uint32)(_cy_iE== 0x0); _cy_iF = (double *)malloc(j); ptr_prod += (uint32)(_cy_iF== 0x0); - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/16-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < NDIVR/CY_THREADS; j++) @@ -1235,7 +1235,7 @@ half_arr+5*radix radix [LOACC-only] inv_mult-lut { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); } first_entry=FALSE; @@ -1415,8 +1415,8 @@ for(outer=0; outer <= 1; outer++) for(ithread = 0; ithread < CY_THREADS; ithread++) { // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); 
+ ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1426,8 +1426,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1439,22 +1439,22 @@ for(outer=0; outer <= 1; outer++) // on successive calls, so set here at runtime rather than in init-only block: tdat[ithread].arrdat = a; /* Main data array */ #ifdef USE_FGT61 - ASSERT(HERE, tdat[ithread].brrdat == b, "thread-local memcheck fail!"); /* Modular version of main data array */ + ASSERT(tdat[ithread].brrdat == b, "thread-local memcheck fail!"); /* Modular version of main data array */ #endif - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + 
ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; - ASSERT(HERE, ((tmp + 0x20)->d0 == ISRT2 && (tmp + 0x20)->d1 == ISRT2), "thread-local memcheck failed!"); + ASSERT(((tmp + 0x20)->d0 == ISRT2 && (tmp + 0x20)->d1 == ISRT2), "thread-local memcheck failed!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif @@ -1464,11 +1464,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * 
(tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif tdat[ithread].bjmodn0 = _bjmodn0[ithread]; tdat[ithread].bjmodn1 = _bjmodn1[ithread]; @@ -1510,8 +1510,8 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_SSE2) // This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots: - dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); // scale gets set immediately prior to calling carry macro, hence no use checking it here. #endif /* init carries */ @@ -1858,7 +1858,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy16_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy16_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1868,7 +1868,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix16_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1957,7 +1957,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 1,""); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 
1,""); /* Make sure loop only gets executed if multiple threads */ _cy_r0[ithread] = _cy_r0[ithread-1]; _cy_r1[ithread] = _cy_r1[ithread-1]; _cy_r2[ithread] = _cy_r2[ithread-1]; @@ -2018,7 +2018,7 @@ for(outer=0; outer <= 1; outer++) // Must use NDIVR instead of p1 here since p1 may have pads which are not applied to element-2-slots-before j1 = NDIVR-2; j1 += ( (j1 >> DAT_BITS) << PAD_BITS ); j2 = j1+RE_IM_STRIDE; - ASSERT(HERE, t31 <= 1.0 && t32 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!"); + ASSERT(t31 <= 1.0 && t32 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!"); // Undo the initial dif pass just for the 16 complex terms in question: RADIX_16_DIT(a[j1],a[j2],a[j1+p1 ],a[j2+p1 ],a[j1+p2 ],a[j2+p2 ],a[j1+p3 ],a[j2+p3 ],a[j1+p4 ],a[j2+p4 ],a[j1+p5 ],a[j2+p5 ],a[j1+p6 ],a[j2+p6 ],a[j1+p7 ],a[j2+p7 ],a[j1+p8 ],a[j2+p8 ],a[j1+p9 ],a[j2+p9 ],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15] ,a[j1],a[j2],a[j1+p1 ],a[j2+p1 ],a[j1+p2 ],a[j2+p2 ],a[j1+p3 ],a[j2+p3 ],a[j1+p4 ],a[j2+p4 ],a[j1+p5 ],a[j2+p5 ],a[j1+p6 ],a[j2+p6 ],a[j1+p7 ],a[j2+p7 ],a[j1+p8 ],a[j2+p8 ],a[j1+p9 ],a[j2+p9 ],a[j1+p10],a[j2+p10],a[j1+p11],a[j2+p11],a[j1+p12],a[j2+p12],a[j1+p13],a[j2+p13],a[j1+p14],a[j2+p14],a[j1+p15],a[j2+p15] @@ -2043,11 +2043,11 @@ for(outer=0; outer <= 1; outer++) // Verify that any cyout = 1 has the corresponding high word < 0, // then absorb cyout back into the high word and zero the carry: if(t31 == 1.0) { - ASSERT(HERE, a[j1+p15] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!"); + ASSERT(a[j1+p15] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!"); a[j1+p15] += FFT_MUL_BASE; t31 = 0.0; } if(t32 == 1.0) { - ASSERT(HERE, a[j2+p15] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!"); + ASSERT(a[j2+p15] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!"); a[j2+p15] 
+= FFT_MUL_BASE; t32 = 0.0; } // Redo the initial dif pass just for the 16 complex terms in question: @@ -2058,7 +2058,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 1,""); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 1,""); /* Make sure loop only gets executed if multiple threads */ _cy_r0[ithread] = _cy_r0[ithread-1]; _cy_i0[ithread] = _cy_i0[ithread-1]; _cy_r1[ithread] = _cy_r1[ithread-1]; _cy_i1[ithread] = _cy_i1[ithread-1]; _cy_r2[ithread] = _cy_r2[ithread-1]; _cy_i2[ithread] = _cy_i2[ithread-1]; @@ -2115,22 +2115,22 @@ for(outer=0; outer <= 1; outer++) #ifdef USE_FGT61 if(!j) { printf("J = 0, wraparound INputs:\n"); - printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2 ],a[j2 +1], b[j2 ],b[j2 +1]); - printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p1 ],a[j2+p1 +1], b[j2+p1 ],b[j2+p1 +1]); - printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p2 ],a[j2+p2 +1], b[j2+p2 ],b[j2+p2 +1]); - printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p3 ],a[j2+p3 +1], b[j2+p3 ],b[j2+p3 +1]); - printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p4 ],a[j2+p4 +1], b[j2+p4 ],b[j2+p4 +1]); - printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p5 ],a[j2+p5 +1], b[j2+p5 ],b[j2+p5 +1]); - printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p6 ],a[j2+p6 +1], b[j2+p6 ],b[j2+p6 +1]); - printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p7 ],a[j2+p7 +1], b[j2+p7 ],b[j2+p7 +1]); - printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p8 ],a[j2+p8 +1], b[j2+p8 ],b[j2+p8 +1]); - printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p9 ],a[j2+p9 +1], b[j2+p9 ],b[j2+p9 +1]); - printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, 
%20llu\n",a[j2+p10],a[j2+p10+1], b[j2+p10],b[j2+p10+1]); - printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p11],a[j2+p11+1], b[j2+p11],b[j2+p11+1]); - printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p12],a[j2+p12+1], b[j2+p12],b[j2+p12+1]); - printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p13],a[j2+p13+1], b[j2+p13],b[j2+p13+1]); - printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p14],a[j2+p14+1], b[j2+p14],b[j2+p14+1]); - printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p15],a[j2+p15+1], b[j2+p15],b[j2+p15+1]); + printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2 ],a[j2 +1], b[j2 ],b[j2 +1]); + printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p1 ],a[j2+p1 +1], b[j2+p1 ],b[j2+p1 +1]); + printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p2 ],a[j2+p2 +1], b[j2+p2 ],b[j2+p2 +1]); + printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p3 ],a[j2+p3 +1], b[j2+p3 ],b[j2+p3 +1]); + printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p4 ],a[j2+p4 +1], b[j2+p4 ],b[j2+p4 +1]); + printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p5 ],a[j2+p5 +1], b[j2+p5 ],b[j2+p5 +1]); + printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p6 ],a[j2+p6 +1], b[j2+p6 ],b[j2+p6 +1]); + printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p7 ],a[j2+p7 +1], b[j2+p7 ],b[j2+p7 +1]); + printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p8 ],a[j2+p8 +1], b[j2+p8 ],b[j2+p8 +1]); + printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p9 ],a[j2+p9 +1], b[j2+p9 ],b[j2+p9 +1]); + 
printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p10],a[j2+p10+1], b[j2+p10],b[j2+p10+1]); + printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p11],a[j2+p11+1], b[j2+p11],b[j2+p11+1]); + printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p12],a[j2+p12+1], b[j2+p12],b[j2+p12+1]); + printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p13],a[j2+p13+1], b[j2+p13],b[j2+p13+1]); + printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p14],a[j2+p14+1], b[j2+p14],b[j2+p14+1]); + printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p15],a[j2+p15+1], b[j2+p15],b[j2+p15+1]); } #endif a[j2 ] *= radix_inv; @@ -2169,22 +2169,22 @@ for(outer=0; outer <= 1; outer++) b[j2+p15] = mul_pow2_modq( b[j2+p15], 57); if(j==1) { printf("J = 0, wraparound OUTputs:\n"); - printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2 -1],a[j2 ], b[j2 -1],b[j2 ]); - printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p1 -1],a[j2+p1 ], b[j2+p1 -1],b[j2+p1 ]); - printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p2 -1],a[j2+p2 ], b[j2+p2 -1],b[j2+p2 ]); - printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p3 -1],a[j2+p3 ], b[j2+p3 -1],b[j2+p3 ]); - printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p4 -1],a[j2+p4 ], b[j2+p4 -1],b[j2+p4 ]); - printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p5 -1],a[j2+p5 ], b[j2+p5 -1],b[j2+p5 ]); - printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p6 -1],a[j2+p6 ], b[j2+p6 -1],b[j2+p6 ]); - printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p7 -1],a[j2+p7 ], b[j2+p7 -1],b[j2+p7 ]); - printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, 
%20llu\n",a[j2+p8 -1],a[j2+p8 ], b[j2+p8 -1],b[j2+p8 ]); - printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p9 -1],a[j2+p9 ], b[j2+p9 -1],b[j2+p9 ]); - printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p10-1],a[j2+p10], b[j2+p10-1],b[j2+p10]); - printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p11-1],a[j2+p11], b[j2+p11-1],b[j2+p11]); - printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p12-1],a[j2+p12], b[j2+p12-1],b[j2+p12]); - printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p13-1],a[j2+p13], b[j2+p13-1],b[j2+p13]); - printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p14-1],a[j2+p14], b[j2+p14-1],b[j2+p14]); - printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a[j2+p15-1],a[j2+p15], b[j2+p15-1],b[j2+p15]); + printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2 -1],a[j2 ], b[j2 -1],b[j2 ]); + printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p1 -1],a[j2+p1 ], b[j2+p1 -1],b[j2+p1 ]); + printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p2 -1],a[j2+p2 ], b[j2+p2 -1],b[j2+p2 ]); + printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p3 -1],a[j2+p3 ], b[j2+p3 -1],b[j2+p3 ]); + printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p4 -1],a[j2+p4 ], b[j2+p4 -1],b[j2+p4 ]); + printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p5 -1],a[j2+p5 ], b[j2+p5 -1],b[j2+p5 ]); + printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p6 -1],a[j2+p6 ], b[j2+p6 -1],b[j2+p6 ]); + printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p7 -1],a[j2+p7 ], b[j2+p7 -1],b[j2+p7 ]); + printf("a1p8r,a1p8i, b1p8r,b1p8i = 
%20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p8 -1],a[j2+p8 ], b[j2+p8 -1],b[j2+p8 ]); + printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p9 -1],a[j2+p9 ], b[j2+p9 -1],b[j2+p9 ]); + printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p10-1],a[j2+p10], b[j2+p10-1],b[j2+p10]); + printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p11-1],a[j2+p11], b[j2+p11-1],b[j2+p11]); + printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p12-1],a[j2+p12], b[j2+p12-1],b[j2+p12]); + printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p13-1],a[j2+p13], b[j2+p13-1],b[j2+p13]); + printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p14-1],a[j2+p14], b[j2+p14-1],b[j2+p14]); + printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j2+p15-1],a[j2+p15], b[j2+p15-1],b[j2+p15]); } #endif } @@ -2392,7 +2392,7 @@ void radix16_dif_pass1 (double a[], int n) ===============*/ /*...Block 1: t1,9,17,25 */ jt = j1; jp = j2; - /* Debug: check for overflow of + terms: */ ASSERT(HERE, m1+m9 >= m1 && m2+m10 >= m2,"Overflow of [0,8b] term!"); + /* Debug: check for overflow of + terms: */ ASSERT(m1+m9 >= m1 && m2+m10 >= m2,"Overflow of [0,8b] term!"); rt =t9; t9 =t1 -rt; t1 =t1 +rt; rm =m9; m9 =qreduce(m1 -rm+q4); m1 =qreduce(m1 +rm ); // 1, 2 in 0,8b -> 0,b it =t10;t10=t2 -it; t2 =t2 +it; im =m10;m10=qreduce(m2 -im+q4); m2 =qreduce(m2 +im+q4); // 9,10 in -4b,4b -> 0,b @@ -2784,10 +2784,10 @@ void radix16_dit_pass1 (double a[], int n) /*...Block 1: t1,9,17,25 */ /* printf("Block 1 float/int inputs:\n"); -printf("1 ,2 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t1 ,t2 , m1 ,m2 , q-qreduce_full(m1 ),q-qreduce_full(m2 )); -printf("9 ,10 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t9 ,t10, m9 ,m10, 
q-qreduce_full(m9 ),q-qreduce_full(m10)); -printf("17,18 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t17,t18, m17,m18, q-qreduce_full(m17),q-qreduce_full(m18)); -printf("25,26 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t25,t26, m25,m26, q-qreduce_full(m25),q-qreduce_full(m26)); +printf("1 ,2 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t1 ,t2 , m1 ,m2 , q-qreduce_full(m1 ),q-qreduce_full(m2 )); +printf("9 ,10 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t9 ,t10, m9 ,m10, q-qreduce_full(m9 ),q-qreduce_full(m10)); +printf("17,18 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t17,t18, m17,m18, q-qreduce_full(m17),q-qreduce_full(m18)); +printf("25,26 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t25,t26, m25,m26, q-qreduce_full(m25),q-qreduce_full(m26)); */ rt =t9 ; t9 =t1 -rt; t1 =t1 +rt; rm =m9 ; m9 =qreduce(m1 -rm+q4); m1 =qreduce(m1 +rm); // +: 0,8b -> 0,b it =t10; t10=t2 -it; t2 =t2 +it; im =m10; m10=qreduce(m2 -im+q4); m2 =qreduce(m2 +im); // -: -4b,4b -> 0,b @@ -2804,10 +2804,10 @@ printf("25,26 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t /*...Block 3: t5,13,21,29 */ /* printf("Block 3 float/int inputs:\n"); -printf("5 ,6 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t5 ,t6 , m5 ,m6 , q-qreduce_full(m5 ),q-qreduce_full(m6 )); -printf("13,14 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t13,t14, m13,m14, q-qreduce_full(m13),q-qreduce_full(m14)); -printf("21,22 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t21,t22, m21,m22, q-qreduce_full(m21),q-qreduce_full(m22)); -printf("29,30 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t29,t30, m29,m30, q-qreduce_full(m29),q-qreduce_full(m30)); +printf("5 ,6 float = [%10.5f,%10.5f]; int = [%" PRIu64 
",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t5 ,t6 , m5 ,m6 , q-qreduce_full(m5 ),q-qreduce_full(m6 )); +printf("13,14 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t13,t14, m13,m14, q-qreduce_full(m13),q-qreduce_full(m14)); +printf("21,22 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t21,t22, m21,m22, q-qreduce_full(m21),q-qreduce_full(m22)); +printf("29,30 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t29,t30, m29,m30, q-qreduce_full(m29),q-qreduce_full(m30)); */ rt =t13; t13=t5 -t14; t5 =t5 +t14; rm =m13;m13=qreduce(m5-m14+q4); m5 =qreduce(m5 +m14+q4); // all 4 outs in -4b,4b; t14=t6 +rt; t6 =t6 -rt; m14=qreduce(m6+rm +q4); m6 =qreduce(m6 -rm +q4); // reduce all 4 to 0,b. @@ -2826,10 +2826,10 @@ t21=rt; rt =(t29-t30)*ISRT2;it =(t29+t30)*ISRT2; rm = mul_i2(m29-m30+q4); im = m /*...Block 2: t3,11,19,27 */ /* printf("Block 2 float/int inputs:\n"); -printf("3 ,4 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t3 ,t4 , m3 ,m4 , q-qreduce_full(m3 ),q-qreduce_full(m4 )); -printf("11,12 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t11,t12, m11,m12, q-qreduce_full(m11),q-qreduce_full(m12)); -printf("19,20 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t19,t20, m19,m20, q-qreduce_full(m19),q-qreduce_full(m20)); -printf("27,28 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t27,t28, m27,m28, q-qreduce_full(m27),q-qreduce_full(m28)); +printf("3 ,4 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t3 ,t4 , m3 ,m4 , q-qreduce_full(m3 ),q-qreduce_full(m4 )); +printf("11,12 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t11,t12, m11,m12, q-qreduce_full(m11),q-qreduce_full(m12)); +printf("19,20 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" 
PRIu64 "]\n",t19,t20, m19,m20, q-qreduce_full(m19),q-qreduce_full(m20)); +printf("27,28 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t27,t28, m27,m28, q-qreduce_full(m27),q-qreduce_full(m28)); */ rt =(t12+t11)*ISRT2;it =(t12-t11)*ISRT2; rm = mul_i2(m12+m11+q4);im = mul_i2(m12-m11+q4); // 0,b30 t11 = t3 -rt; t3 = t3 +rt; m11 = m3 -rm; m3 = m3 +rm; // 3, 4 in -2b,2b+b30 @@ -2856,10 +2856,10 @@ t19=rt; rt =t27*s + t28*c; it =t28*s - t27*c; cmul_modq8(m27,m28, sm,q8-cm, &rm /*...Block 4: t7,15,23,31 */ /* printf("Block 4 float/int inputs:\n"); -printf(" 7, 8 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t7 ,t8 , m7 ,m8 , q-qreduce_full(m7 ),q-qreduce_full(m8 )); -printf("15,16 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t15,t16, m15,m16, q-qreduce_full(m15),q-qreduce_full(m16)); -printf("23,24 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t23,t24, m23,m24, q-qreduce_full(m23),q-qreduce_full(m24)); -printf("31,32 float = [%10.5f,%10.5f]; int = [%llu,%llu]; neg = [%llu,%llu]\n",t31,t32, m31,m32, q-qreduce_full(m31),q-qreduce_full(m32)); +printf(" 7, 8 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t7 ,t8 , m7 ,m8 , q-qreduce_full(m7 ),q-qreduce_full(m8 )); +printf("15,16 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t15,t16, m15,m16, q-qreduce_full(m15),q-qreduce_full(m16)); +printf("23,24 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t23,t24, m23,m24, q-qreduce_full(m23),q-qreduce_full(m24)); +printf("31,32 float = [%10.5f,%10.5f]; int = [%" PRIu64 ",%" PRIu64 "]; neg = [%" PRIu64 ",%" PRIu64 "]\n",t31,t32, m31,m32, q-qreduce_full(m31),q-qreduce_full(m32)); exit(0); */ rt =(t15-t16)*ISRT2;it =(t15+t16)*ISRT2; rm = mul_i2(m15-m16+q4);im = mul_i2(m15+m16+q4); // 0,b30 @@ -3143,8 +3143,8 @@ t23=rt; rt =t31*c + t32*s; it =t32*c - 
t31*s; cmul_modq8(m31,m32, cm,q8-sm, &rm double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -3251,9 +3251,9 @@ t23=rt; rt =t31*c + t32*s; it =t32*c - t31*s; cmul_modq8(m31,m32, cm,q8-sm, &rm half_arr= tmp + 0x12; /* This table needs 20x16 bytes */ #endif - ASSERT(HERE, (isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!"); + ASSERT((isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -3263,18 +3263,18 @@ t23=rt; rt =t31*c + t32*s; it =t32*c - t31*s; cmul_modq8(m31,m32, cm,q8-sm, &rm /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; 
ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { #ifdef USE_AVX512 /* No-Op */ #else - dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } @@ -3742,8 +3742,8 @@ t23=rt; rt =t31*c + t32*s; it =t32*c - t31*s; cmul_modq8(m31,m32, cm,q8-sm, &rm /* Now, finally can update fx and cy: */\ *cy = DNINT(temp*baseinv[i]); check_nint(*cy, temp*baseinv[i]);/*@*/ *fx = (temp-*cy * base[i])*wt;/*@*/ - ASSERT(HERE, *fx == (double)rm * wt, "Bad mod-Xout!"); /* put rm into double-version and forward weight *//*@*/ - ASSERT(HERE, itmp == *cy, "Bad mod-carry!"); + ASSERT(*fx == (double)rm * wt, "Bad mod-Xout!"); /* put rm into double-version and forward weight *//*@*/ + ASSERT(itmp == *cy, "Bad mod-carry!"); /*========================*//*@*/ bjmodn = (bjmodn + bw) & nm1;/*@*/ wt =wtlp1*wtA;/*@*/ diff --git a/src/radix16_dyadic_square.c b/src/radix16_dyadic_square.c index ef0572e0..38422dee 100755 --- a/src/radix16_dyadic_square.c +++ b/src/radix16_dyadic_square.c @@ -154,7 +154,7 @@ void radix16_dyadic_square( b = (double *)(fwd_fft_only & ~0xCull); // BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input 
dyadic-mul following fwd-FFT" relies on that != 0: if(fwd_fft_only & 0xC) { - ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry + ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry fwd_fft_only = 3ull; } } @@ -172,9 +172,9 @@ void radix16_dyadic_square( /**************************************************************************************************************************************/ if((rad0save != radix0) || (nsave != n)) { - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); nsave = n; - ASSERT(HERE, N2 == n/2, "N2 bad!"); + ASSERT(N2 == n/2, "N2 bad!"); rad0save = radix0; ndivrad0 = n/radix0; for(j = 0; j < ndivrad0; j += stride) @@ -182,7 +182,7 @@ void radix16_dyadic_square( j1 = j + ( (j >> DAT_BITS) << PAD_BITS ); if( (j1+stridh) != (j+stridh) + ( ((j+stridh) >> DAT_BITS) << PAD_BITS ) ) { printf("j, j1, stride/2 = %d,%d,%d, jpad = %d\n",j,j1, stridh, (j+stridh) + (((j+stridh) >> DAT_BITS) << PAD_BITS) ); - ASSERT(HERE, 0 , "add1 calculation violates padded index rules!"); + ASSERT(0 , "add1 calculation violates padded index rules!"); } } if(index_ptmp0) { @@ -195,7 +195,7 @@ void radix16_dyadic_square( index_ptmp = ALLOC_INT(N2/16); index = ALIGN_INT(index_ptmp); - if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } for(i=0; i < N2/16; i++) { index[i]=i; @@ -205,11 +205,11 @@ void radix16_dyadic_square( index1_mod = (n>>5)/radix0; /* complex length requires an additional divide by 2 */ index_ptmp0 = ALLOC_INT(index_ptmp0, index0_mod); - 
if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } index0 = ALIGN_INT(index_ptmp0); index_ptmp1 = ALLOC_INT(index_ptmp1, index1_mod); - if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } index1 = ALIGN_INT(index_ptmp1); for(i=0; i < index0_mod; i++){index0[i]= i;} @@ -228,7 +228,7 @@ void radix16_dyadic_square( if(i == radix0) break; } - if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } bit_reverse_int(index0, index0_mod, nradices_prim_radix0, &radix_prim[nradices_prim_radix0-1], -1,(int *)arr_scratch); bit_reverse_int(index1, index1_mod, nradices_prim-4-nradices_prim_radix0, &radix_prim[nradices_prim -5], -1,(int *)arr_scratch); @@ -243,10 +243,10 @@ void radix16_dyadic_square( if(init_sse2 <= max_threads) // current alloc sufficient return; - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif #ifdef USE_SSE2 @@ -256,13 +256,13 @@ void radix16_dyadic_square( 
free((void *)sc_arr); sc_arr=0x0; } // Index vectors used in SIMD roots-computation. - sm_arr = ALLOC_INT(sm_arr, max_threads*10*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sm_arr = ALLOC_INT(sm_arr, max_threads*10*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sm_ptr = ALIGN_INT(sm_arr); - ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); // Twiddles-array: - sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads + 100); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 72*max_threads + 100); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 32 16-byte slots of sc_arr for temporaries, next 3 for the nontrivial complex 16th roots, last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array. 
@@ -328,7 +328,7 @@ void radix16_dyadic_square( /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); #ifdef USE_SSE2 k1_arr = __i0 + thr_id*10*RE_IM_STRIDE; k2_arr = k1_arr + 5*RE_IM_STRIDE; @@ -357,9 +357,9 @@ void radix16_dyadic_square( #endif /*...If a new runlength, should not get to this point: */ - ASSERT(HERE, n == nsave,"n != nsave"); - ASSERT(HERE, incr == 32,"incr != 32"); - ASSERT(HERE, ndivrad0 == n/radix0,"bad value for ndivrad0!"); + ASSERT(n == nsave,"n != nsave"); + ASSERT(incr == 32,"incr != 32"); + ASSERT(ndivrad0 == n/radix0,"bad value for ndivrad0!"); /* k = ii*(ndivrad0 >> 5); */ diff --git a/src/radix16_main_carry_loop.h b/src/radix16_main_carry_loop.h index 26609938..61749fcf 100755 --- a/src/radix16_main_carry_loop.h +++ b/src/radix16_main_carry_loop.h @@ -800,22 +800,22 @@ t23=rt; rt =t31*c + t32*s; it =t32*c - t31*s; cmul_modq8(m31,m32, cm,q8-sm, &rm if(!j) { printf("J = 0, carry-step INputs:\n"); - printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a1p0r,a1p0i, b1p0r,b1p0i); - printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a1p1r,a1p1i, b1p1r,b1p1i); - printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a1p2r,a1p2i, b1p2r,b1p2i); - printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a1p3r,a1p3i, b1p3r,b1p3i); - printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a1p4r,a1p4i, b1p4r,b1p4i); - printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a1p5r,a1p5i, b1p5r,b1p5i); - printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a1p6r,a1p6i, b1p6r,b1p6i); - printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a1p7r,a1p7i, b1p7r,b1p7i); - printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, 
%20llu\n",a1p8r,a1p8i, b1p8r,b1p8i); - printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a1p9r,a1p9i, b1p9r,b1p9i); - printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a1pAr,a1pAi, b1pAr,b1pAi); - printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a1pBr,a1pBi, b1pBr,b1pBi); - printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a1pCr,a1pCi, b1pCr,b1pCi); - printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a1pDr,a1pDi, b1pDr,b1pDi); - printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a1pEr,a1pEi, b1pEr,b1pEi); - printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a1pFr,a1pFi, b1pFr,b1pFi); + printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p0r,a1p0i, b1p0r,b1p0i); + printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p1r,a1p1i, b1p1r,b1p1i); + printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p2r,a1p2i, b1p2r,b1p2i); + printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p3r,a1p3i, b1p3r,b1p3i); + printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p4r,a1p4i, b1p4r,b1p4i); + printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p5r,a1p5i, b1p5r,b1p5i); + printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p6r,a1p6i, b1p6r,b1p6i); + printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p7r,a1p7i, b1p7r,b1p7i); + printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p8r,a1p8i, b1p8r,b1p8i); + printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p9r,a1p9i, b1p9r,b1p9i); + printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pAr,a1pAi, b1pAr,b1pAi); + 
printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pBr,a1pBi, b1pBr,b1pBi); + printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pCr,a1pCi, b1pCr,b1pCi); + printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pDr,a1pDi, b1pDr,b1pDi); + printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pEr,a1pEi, b1pEr,b1pEi); + printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pFr,a1pFi, b1pFr,b1pFi); } if(!j) { if(full_pass)printf("\n"); @@ -864,22 +864,22 @@ if(!j) */ if(!j) { printf("J = 0, carry-step OUTputs:\n"); - printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a1p0r,a1p0i, b1p0r,b1p0i); - printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a1p1r,a1p1i, b1p1r,b1p1i); - printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a1p2r,a1p2i, b1p2r,b1p2i); - printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a1p3r,a1p3i, b1p3r,b1p3i); - printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a1p4r,a1p4i, b1p4r,b1p4i); - printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a1p5r,a1p5i, b1p5r,b1p5i); - printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a1p6r,a1p6i, b1p6r,b1p6i); - printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a1p7r,a1p7i, b1p7r,b1p7i); - printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a1p8r,a1p8i, b1p8r,b1p8i); - printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a1p9r,a1p9i, b1p9r,b1p9i); - printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a1pAr,a1pAi, b1pAr,b1pAi); - printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a1pBr,a1pBi, b1pBr,b1pBi); - printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a1pCr,a1pCi, b1pCr,b1pCi); - 
printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a1pDr,a1pDi, b1pDr,b1pDi); - printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a1pEr,a1pEi, b1pEr,b1pEi); - printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a1pFr,a1pFi, b1pFr,b1pFi); + printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p0r,a1p0i, b1p0r,b1p0i); + printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p1r,a1p1i, b1p1r,b1p1i); + printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p2r,a1p2i, b1p2r,b1p2i); + printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p3r,a1p3i, b1p3r,b1p3i); + printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p4r,a1p4i, b1p4r,b1p4i); + printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p5r,a1p5i, b1p5r,b1p5i); + printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p6r,a1p6i, b1p6r,b1p6i); + printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p7r,a1p7i, b1p7r,b1p7i); + printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p8r,a1p8i, b1p8r,b1p8i); + printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1p9r,a1p9i, b1p9r,b1p9i); + printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pAr,a1pAi, b1pAr,b1pAi); + printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pBr,a1pBi, b1pBr,b1pBi); + printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pCr,a1pCi, b1pCr,b1pCi); + printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pDr,a1pDi, b1pDr,b1pDi); + printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pEr,a1pEi, 
b1pEr,b1pEi); + printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a1pFr,a1pFi, b1pFr,b1pFi); printf("\niter %2u [full-pass = %u]: a01 OUT: %20.10e, %20.10e, cy = %20.10e\n",iter,full_pass,a1p0r,a1p0i,cy_r0); } @@ -1309,7 +1309,7 @@ if(!j) { ===============*/ /*...Block 1: t1,9,17,25 */ jt = j1; jp = j2; - /* Debug: check for overflow of + terms: */ ASSERT(HERE, m1+m9 >= m1 && m$+m10 >= m$,"Overflow of [0,8b] term!"); + /* Debug: check for overflow of + terms: */ ASSERT(m1+m9 >= m1 && m$+m10 >= m$,"Overflow of [0,8b] term!"); rt =t9; t9 =t1 -rt; t1 =t1 +rt; rm =m9; m9 =qreduce(m1 -rm+q4); m1 =qreduce(m1 +rm ); // +: 0,8b -> 0,b it =t10;t10=t2 -it; t2 =t2 +it; im =m10;m10=qreduce(m$ -im+q4); m$ =qreduce(m$ +im ); // -: -4b,4b -> 0,b @@ -1376,22 +1376,22 @@ t23=rt; rt =t31*c - t32*s; it =t32*c + t31*s; cmul_modq8(m31,m32, cm,sm, &rm, if(!j) { printf("J = 0, DIF1 OUTputs:\n"); - printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1 ],a[j1 +1], b[j1 ],b[j1 +1]); - printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p1 ],a[j1+p1 +1], b[j1+p1 ],b[j1+p1 +1]); - printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p2 ],a[j1+p2 +1], b[j1+p2 ],b[j1+p2 +1]); - printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p3 ],a[j1+p3 +1], b[j1+p3 ],b[j1+p3 +1]); - printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p4 ],a[j1+p4 +1], b[j1+p4 ],b[j1+p4 +1]); - printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p5 ],a[j1+p5 +1], b[j1+p5 ],b[j1+p5 +1]); - printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p6 ],a[j1+p6 +1], b[j1+p6 ],b[j1+p6 +1]); - printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p7 ],a[j1+p7 +1], b[j1+p7 ],b[j1+p7 +1]); - printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p8 ],a[j1+p8 +1], b[j1+p8 ],b[j1+p8 +1]); - 
printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p9 ],a[j1+p9 +1], b[j1+p9 ],b[j1+p9 +1]); - printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p10],a[j1+p10+1], b[j1+p10],b[j1+p10+1]); - printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p11],a[j1+p11+1], b[j1+p11],b[j1+p11+1]); - printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p12],a[j1+p12+1], b[j1+p12],b[j1+p12+1]); - printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p13],a[j1+p13+1], b[j1+p13],b[j1+p13+1]); - printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p14],a[j1+p14+1], b[j1+p14],b[j1+p14+1]); - printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20llu, %20llu\n",a[j1+p15],a[j1+p15+1], b[j1+p15],b[j1+p15+1]); + printf("a1p0r,a1p0i, b1p0r,b1p0i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1 ],a[j1 +1], b[j1 ],b[j1 +1]); + printf("a1p1r,a1p1i, b1p1r,b1p1i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p1 ],a[j1+p1 +1], b[j1+p1 ],b[j1+p1 +1]); + printf("a1p2r,a1p2i, b1p2r,b1p2i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p2 ],a[j1+p2 +1], b[j1+p2 ],b[j1+p2 +1]); + printf("a1p3r,a1p3i, b1p3r,b1p3i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p3 ],a[j1+p3 +1], b[j1+p3 ],b[j1+p3 +1]); + printf("a1p4r,a1p4i, b1p4r,b1p4i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p4 ],a[j1+p4 +1], b[j1+p4 ],b[j1+p4 +1]); + printf("a1p5r,a1p5i, b1p5r,b1p5i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p5 ],a[j1+p5 +1], b[j1+p5 ],b[j1+p5 +1]); + printf("a1p6r,a1p6i, b1p6r,b1p6i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p6 ],a[j1+p6 +1], b[j1+p6 ],b[j1+p6 +1]); + printf("a1p7r,a1p7i, b1p7r,b1p7i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p7 ],a[j1+p7 +1], b[j1+p7 ],b[j1+p7 +1]); + printf("a1p8r,a1p8i, b1p8r,b1p8i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p8 
],a[j1+p8 +1], b[j1+p8 ],b[j1+p8 +1]); + printf("a1p9r,a1p9i, b1p9r,b1p9i = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p9 ],a[j1+p9 +1], b[j1+p9 ],b[j1+p9 +1]); + printf("a1pAr,a1pAi, b1pAr,b1pAi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p10],a[j1+p10+1], b[j1+p10],b[j1+p10+1]); + printf("a1pBr,a1pBi, b1pBr,b1pBi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p11],a[j1+p11+1], b[j1+p11],b[j1+p11+1]); + printf("a1pCr,a1pCi, b1pCr,b1pCi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p12],a[j1+p12+1], b[j1+p12],b[j1+p12+1]); + printf("a1pDr,a1pDi, b1pDr,b1pDi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p13],a[j1+p13+1], b[j1+p13],b[j1+p13+1]); + printf("a1pEr,a1pEi, b1pEr,b1pEi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p14],a[j1+p14+1], b[j1+p14],b[j1+p14+1]); + printf("a1pFr,a1pFi, b1pFr,b1pFi = %20.10e, %20.10e, %20" PRIu64 ", %20" PRIu64 "\n",a[j1+p15],a[j1+p15+1], b[j1+p15],b[j1+p15+1]); } /**********************************************/ #else // USE_FGT61 = False; Basic scalar-double mode: diff --git a/src/radix16_pairFFT_mul.c b/src/radix16_pairFFT_mul.c index bbe821db..67793baa 100755 --- a/src/radix16_pairFFT_mul.c +++ b/src/radix16_pairFFT_mul.c @@ -247,7 +247,7 @@ void radix16_pairFFT_mul( if(INIT_ARRAYS) { nsave = n; - ASSERT(HERE, N2 == n/2, "N2 bad!"); + ASSERT(N2 == n/2, "N2 bad!"); #if SYMM == 2 // Use complex-plane symmetries to reduce fraction of rt1 array actually needed nh = n/(NRT<<2); // #rt1 elts in each quadrant @@ -282,7 +282,7 @@ void radix16_pairFFT_mul( free((void *)index_ptmp); index_ptmp=0x0; } index_ptmp = ALLOC_INT(index_ptmp, N2/16); - ASSERT(HERE, index_ptmp != 0,"ERROR: unable to allocate array INDEX!"); + ASSERT(index_ptmp != 0,"ERROR: unable to allocate array INDEX!"); index = ALIGN_INT(index_ptmp); /* !...Now rearrange FFT sincos indices using the main loop structure as a template. 
@@ -321,7 +321,7 @@ void radix16_pairFFT_mul( if(j2_start == n-32)break; blocklen_sum = blocklen_sum + blocklen; - ASSERT(HERE, i != 0,"ERROR 10!"); + ASSERT(i != 0,"ERROR 10!"); blocklen = (radix_prim[i-1]-1)*blocklen_sum; j2_start = j2_start+(blocklen<<2); @@ -342,14 +342,14 @@ void radix16_pairFFT_mul( /*...If a new runlength, should not get to this point: */ if(n != nsave) { sprintf(cbuf,"ERROR: %s: INIT_ARRAYS not invoked for new runlength!",func); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* If precomputing a forward FFT of a set of inputs, make sure they're in the uv-vector and the abcd-multiplier vectors are null: */ if(FORWARD_FFT_ONLY == 1 && (ab_mul != 0x0 || cd_mul != 0x0)) { sprintf(cbuf,"%s: FORWARD_FFT_ONLY = TRUE but non-null abcd-multiplier vectors!",func); - ASSERT(HERE, 0,cbuf); + ASSERT(0,cbuf); } /* Init the loop-control variables: */ @@ -1179,7 +1179,7 @@ for(i = nradices_prim-5; i >= 0; i-- ) /* Main loop: lower bound = nradices_prim // Dec 2015: Despite all my efforts, simply not yet able to wring out remaining bug(s) in indexing scheme // here. If and when I do finally get things working, also need to fuse the 2 x PAIR_MUL occurrences on // each line into a working single ABCD_MUL macro, which avoids the work-duplication of the 2 x PAIR_MUL: - ASSERT(HERE, 0, "Linear-combo algorithm not yet working!"); + ASSERT(0, "Linear-combo algorithm not yet working!"); /* Dyadic muls of the forward FFT outputs with the corresponding a/b and c/d-vector data so as to obtain FFT(a*u-b*v, c*u-d*v). 
u,v in ajp*r,i; a,b in ab_mul[even,odd]; c,d in cd_mul[even,odd]: diff --git a/src/radix16_wrapper_ini.c b/src/radix16_wrapper_ini.c index b37fd430..31285043 100755 --- a/src/radix16_wrapper_ini.c +++ b/src/radix16_wrapper_ini.c @@ -75,7 +75,7 @@ void radix16_wrapper_ini(int n, int radix0, int iblock, int nradices_prim, int r ws_m [iblock_next] = m ; ws_blocklen [iblock_next] = blocklen ; ws_blocklen_sum[iblock_next] = blocklen_sum; - // printf("%8llu %20llu %8llu: init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k); + // printf("%8" PRIu64 " %20" PRIu64 " %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k); return; } jump_in: // Entry point for all blocks but the first. diff --git a/src/radix16_wrapper_square.c b/src/radix16_wrapper_square.c index 296b3819..ad9f042b 100755 --- a/src/radix16_wrapper_square.c +++ b/src/radix16_wrapper_square.c @@ -201,7 +201,7 @@ The scratch array (2nd input argument) is only needed for data table initializat b = (double *)(fwd_fft_only & ~0xCull); // BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0: if(fwd_fft_only & 0xC) { - ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry + ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry fwd_fft_only = 3ull; } } @@ -220,10 +220,10 @@ The scratch array (2nd input argument) is only needed for data table initializat nsave = n; if(init_sse2 > max_threads) // current SIMD local-alloc insufficient { - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC 
builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif #ifdef USE_SSE2 @@ -235,14 +235,14 @@ The scratch array (2nd input argument) is only needed for data table initializat // Index vectors used in SIMD roots-computation. // The AVX512 compute-sincos-mults code needs 2 elements per complex-double-load, so use 10*RE_IM_STRIDE per array // to alloc storage here for all cases, even though that leaves upper array halves unused for sub-AVX512. - sm_arr = ALLOC_INT(sm_arr, max_threads*20*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sm_arr = ALLOC_INT(sm_arr, max_threads*20*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sm_ptr = ALIGN_INT(sm_arr); - ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); // Twiddles-array: Need 0x47 slots for data, plus need to leave room to pad-align. 
// v20: To support inline a*(b-c) for p-1 stage 2, need 2*RADIX = 32 added vec_dbl, thus 0x4c ==> 0x6c: - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x6c*max_threads); ASSERT(HERE, sc_arr != 0,"ERROR: unable to allocate sc_arr!"); + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x6c*max_threads); ASSERT(sc_arr != 0,"ERROR: unable to allocate sc_arr!"); sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 32 16-byte slots of sc_arr for temporaries, next 4 for const = 1/4 and nontrivial complex 16th roots, last 30 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array: */ #ifdef MULTITHREAD @@ -392,12 +392,12 @@ The scratch array (2nd input argument) is only needed for data table initializat free((void *)twidl_ptmp); twidl_ptmp = 0x0; #endif } - index_ptmp = ALLOC_INT(index_ptmp, N2/16); ASSERT(HERE, index_ptmp != 0,"ERROR: unable to allocate array INDEX!"); + index_ptmp = ALLOC_INT(index_ptmp, N2/16); ASSERT(index_ptmp != 0,"ERROR: unable to allocate array INDEX!"); index = ALIGN_INT(index_ptmp); #ifdef USE_PRECOMPUTED_TWIDDLES printf("%s: Alloc precomputed-twiddles array with %u Kdoubles.\n",func,N2*15/8); - twidl_ptmp = ALLOC_COMPLEX(twidl_ptmp, N2*15/16); ASSERT(HERE, twidl_ptmp != 0,"ERROR: unable to allocate twidl_ptmp!"); - twidl = ALIGN_COMPLEX(twidl_ptmp); ASSERT(HERE, ((long)twidl & 0x3f) == 0, "twidl-array not 64-byte aligned!"); + twidl_ptmp = ALLOC_COMPLEX(twidl_ptmp, N2*15/16); ASSERT(twidl_ptmp != 0,"ERROR: unable to allocate twidl_ptmp!"); + twidl = ALIGN_COMPLEX(twidl_ptmp); ASSERT(((long)twidl & 0x3f) == 0, "twidl-array not 64-byte aligned!"); #endif /* !...Now rearrange FFT sincos indices using the main loop structure as a template. 
@@ -427,7 +427,7 @@ The scratch array (2nd input argument) is only needed for data table initializat k1 = k1 + (blocklen >> 1); if(j2_start == n-32)break; blocklen_sum = blocklen_sum + blocklen; - ASSERT(HERE, i != 0,"ERROR 10!"); + ASSERT(i != 0,"ERROR 10!"); blocklen = (radix_prim[i-1]-1)*blocklen_sum; j2_start = j2_start+(blocklen<<2); } @@ -1066,7 +1066,7 @@ The scratch array (2nd input argument) is only needed for data table initializat /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); #ifdef USE_SSE2 k1_arr = __i0 + thr_id*20*RE_IM_STRIDE; k2_arr = k1_arr + 10*RE_IM_STRIDE; @@ -1098,7 +1098,7 @@ The scratch array (2nd input argument) is only needed for data table initializat #endif #endif /*...If a new runlength, should not get to this point: */ - ASSERT(HERE, n == nsave,"n != nsave"); + ASSERT(n == nsave,"n != nsave"); /* ! SOLVING THE CACHE FLOW PROBLEM FOR BIT-REVERSED ARRAY DATA: diff --git a/src/radix176_ditN_cy_dif1.c b/src/radix176_ditN_cy_dif1.c index efa7da0f..0a32d5f5 100755 --- a/src/radix176_ditN_cy_dif1.c +++ b/src/radix176_ditN_cy_dif1.c @@ -368,7 +368,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -438,11 +438,11 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -480,7 +480,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -490,7 +490,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -530,24 +530,24 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix176_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix176_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix176_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -1099,12 +1099,12 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1128,7 +1128,7 @@ int radix176_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1281,8 +1281,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1292,8 +1292,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1302,26 +1302,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1451,7 +1451,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy176_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy176_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1461,7 +1461,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2210,8 +2210,8 @@ void radix176_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -2341,21 +2341,21 @@ void radix176_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, 
fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix192_ditN_cy_dif1.c b/src/radix192_ditN_cy_dif1.c index ac366f69..acedd916 100755 --- a/src/radix192_ditN_cy_dif1.c +++ b/src/radix192_ditN_cy_dif1.c @@ -306,7 +306,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -364,7 +364,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] i = 1; #endif if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) { // Only care about this divisibility property for LOACC carry modes: - ASSERT(HERE, 0 == ((RADIX/i) % incr),"Carry-chain wts-multipliers recurrence length must divide RADIX/[n-wayness of carry macro]!"); + ASSERT(0 == ((RADIX/i) % incr),"Carry-chain wts-multipliers recurrence length must divide RADIX/[n-wayness of carry macro]!"); } // For n a power of 2 don't need to worry about 32-bit integer overflow in the sw*NDIVR term, // but for non-power-of-2 n we must cast-to-uint64 to avoid such overflows fubaring the result: @@ -376,11 +376,11 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -418,7 +418,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -428,7 +428,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -468,24 +468,24 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix192_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix192_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix192_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -515,8 +515,8 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] sse2_rnd= tmp + 0x01; // sc_ptr += 0x364; This is where the value of half_arr_offset192 comes from half_arr= tmp + 0x02; // This table needs 32*SZ_VD bytes in sse2 mode #endif -// ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix192_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix192_creals_in_local_store checksum failed!"); +// ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), 
"half_arr_offset mismatches actual!"); + ASSERT((radix192_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix192_creals_in_local_store checksum failed!"); /* Roots of 1 for radix-3 DFTs: cc0 = (cc1+cc2+cc3)/3 - 1; subtract 1 from Nussbaumer's definition in order to ease in-place computation */ VEC_DBL_INIT(cc0, c3m1); @@ -1074,12 +1074,12 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1103,7 +1103,7 @@ int radix192_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1255,8 +1255,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, 
tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1266,8 +1266,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1276,26 +1276,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), 
"thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1425,7 +1425,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy192_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy192_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1435,7 +1435,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } 
/* Copy the thread-specific output carry data back to shared memory: */ @@ -2323,8 +2323,8 @@ void radix192_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -2588,21 +2588,21 @@ void radix192_dit_pass1(double a[], int n) sse2_rnd= tmp + 0x01; // sc_ptr += 0x364; This is where the value of half_arr_offset192 comes from half_arr= tmp + 0x02; // This table needs 20*SZ_VD bytes in sse2 mode #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, 
"thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix208_ditN_cy_dif1.c b/src/radix208_ditN_cy_dif1.c index 7dfc3653..fe506677 100755 --- a/src/radix208_ditN_cy_dif1.c +++ b/src/radix208_ditN_cy_dif1.c @@ -336,7 +336,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -404,11 +404,11 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -446,7 +446,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -456,7 +456,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -496,24 +496,24 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix208_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix208_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -548,7 +548,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r sse2_rnd= tmp + 0x01; // sc_ptr += 0x3c4; This is where the value of half_arr_offset208 comes from half_arr= tmp + 0x02; // This table needs 32*SZ_VD bytes in sse2 mode #endif - ASSERT(HERE, (radix208_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!"); + ASSERT((radix208_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix208_creals_in_local_store checksum failed!"); /* These remain fixed: 
*/ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); #if 1 @@ -971,12 +971,12 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1000,7 +1000,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1152,8 +1152,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1163,8 +1163,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1173,26 +1173,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1322,7 +1322,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy208_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy208_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1332,7 +1332,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2017,8 +2017,8 @@ void radix208_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -2109,21 +2109,21 @@ void radix208_dit_pass1(double a[], int n) sse2_rnd= tmp + 0x01; // sc_ptr += 0x3c4; This is where the value of half_arr_offset208 comes from half_arr= tmp + 0x02; // This table needs 20*SZ_VD bytes in sse2 mode #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // 
SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix20_ditN_cy_dif1.c b/src/radix20_ditN_cy_dif1.c index 0cdaaa8e..e10c5ff3 100755 --- a/src/radix20_ditN_cy_dif1.c +++ b/src/radix20_ditN_cy_dif1.c @@ -279,7 +279,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "radix20_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "radix20_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -346,11 +346,11 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -389,7 +389,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -399,7 +399,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -435,18 +435,18 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 
64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix20_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix20_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 80 16-byte slots of sc_arr for temporaries, next 5 for the nontrivial complex 16th roots, next 10 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -914,12 +914,12 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy18 = (double *)malloc(j); ptr_prod += (uint32)(_cy18== 0x0); _cy19 = (double *)malloc(j); ptr_prod += (uint32)(_cy19== 0x0); - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix20_ditN_cy_dif1."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix20_ditN_cy_dif1."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/20-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix20_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix20_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -943,7 +943,7 @@ int radix20_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); first_entry=FALSE; } /* endif(first_entry) */ @@ -1113,8 +1113,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1124,8 +1124,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1134,20 +1134,20 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // 
SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif @@ -1367,7 +1367,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy20_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy20_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1377,7 +1377,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix32_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1447,7 +1447,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 1,"radix20_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 1,"radix20_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ _cy00[ithread] = _cy00[ithread-1]; _cy01[ithread] = _cy01[ithread-1]; _cy02[ithread] = _cy02[ithread-1]; @@ -1882,8 +1882,8 @@ void radix20_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -1952,17 +1952,17 @@ void radix20_dit_pass1(double a[], int n) s1p18r = tmp + 0x24; half_arr= tmp + 0x3a; /* This table needs 20x16 bytes */ s1p19r = tmp + 0x26; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); tmp = half_arr; #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * 
(tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix224_ditN_cy_dif1.c b/src/radix224_ditN_cy_dif1.c index 4dce5db4..390fd14d 100755 --- a/src/radix224_ditN_cy_dif1.c +++ b/src/radix224_ditN_cy_dif1.c @@ -459,7 +459,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(first_entry) { - ASSERT(HERE, LO_ADD,"LO_ADD"); + ASSERT(LO_ADD,"LO_ADD"); psave = p; nsave = n; radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)RADIX)); n2inv = qfdbl(qf_rational_quotient((int64)1, (int64)(n/2))); @@ -491,11 +491,11 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -533,7 +533,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, 
&thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -543,7 +543,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -588,24 +588,24 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of radix224_creals_in_local_store vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix224_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr 
& 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix224_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -648,8 +648,8 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] sse2_rnd= tmp + 0x01; // sc_ptr += 0x(396 + e0 + 2) = 0x478; This is where the value of half_arr_offset224 comes from half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset224 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix224_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix224_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset224 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix224_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix224_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); @@ -750,7 +750,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. 
qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1640,12 +1640,12 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1680,7 +1680,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1731,7 +1731,7 @@ int radix224_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; }; } - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -2147,8 +2147,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid 
== ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -2158,8 +2158,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -2168,19 +2168,19 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + 
ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2189,11 +2189,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -2203,8 +2203,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" 
carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -2426,7 +2426,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy224_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy224_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -2436,7 +2436,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -3478,8 +3478,8 @@ void radix224_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -4058,17 +4058,17 @@ void radix224_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); #if defined(USE_AVX2) || defined(USE_ARM_V8_SIMD) // AVX2 (i.e. FMA)means non-Nussbaumer radix-7, uses these sincos constants: - ASSERT(HERE, (ds3->d0 == 0.0 && ds3->d1 == 0.0), "thread-local memcheck failed!"); + ASSERT((ds3->d0 == 0.0 && ds3->d1 == 0.0), "thread-local memcheck failed!"); #else /* SSE2 version assumes LO_ADD = 0, i.e. 
the low-mul Nussbaumer-style DFT implementation: */ - ASSERT(HERE, (ds3->d0 == sx3 && ds3->d1 == sx3), "thread-local memcheck failed!"); + ASSERT((ds3->d0 == sx3 && ds3->d1 == sx3), "thread-local memcheck failed!"); #endif - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -4077,15 +4077,15 @@ void radix224_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; 
ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix240_ditN_cy_dif1.c b/src/radix240_ditN_cy_dif1.c index 5b224f28..fde8736a 100755 --- a/src/radix240_ditN_cy_dif1.c +++ b/src/radix240_ditN_cy_dif1.c @@ -477,11 +477,11 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -519,7 +519,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -529,7 +529,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int 
nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -572,24 +572,24 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix240_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix240_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); 
#ifdef USE_PTHREAD __r0 = sc_ptr; @@ -662,8 +662,8 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset240 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix240_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix240_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset240 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix240_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix240_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one , 1.0 ); @@ -753,7 +753,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1174,12 +1174,12 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1214,7 +1214,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1265,7 +1265,7 @@ int radix240_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; }; } // printf("wts_idx_incr = %u\n",wts_idx_incr); - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1748,8 +1748,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1759,8 +1759,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1769,19 +1769,19 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), 
"thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -1790,11 +1790,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1804,8 +1804,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -2026,7 +2026,7 @@ for(outer=0; outer <= 1; outer++) /*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/ for(j = 0; j < main_work_units; ++j) { - ASSERT(HERE, 0x0 == cy240_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy240_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -2036,7 +2036,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2202,7 +2202,7 @@ void radix240_dif_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n/RADIX; @@ -2428,7 +2428,7 @@ void radix240_dit_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n/RADIX; @@ -2853,8 
+2853,8 @@ void radix240_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2991,10 +2991,10 @@ void radix240_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -3003,15 +3003,15 @@ void radix240_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck 
failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix240_main_carry_loop.h b/src/radix240_main_carry_loop.h index c1aa87d8..093364cf 100755 --- a/src/radix240_main_carry_loop.h +++ b/src/radix240_main_carry_loop.h @@ -121,7 +121,7 @@ for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... 
*/ ke = kd-1; ke += (-(ke < 0))&15; kd = (kd << 5) + jt; ke = (ke << 5) + jt; // printf("15-DFT #%2u: [k0-E]/2 = %u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",l,k0/2,k1/2,k2/2,k3/2,k4/2,k5/2,k6/2,k7/2,k8/2,k9/2,ka/2,kb/2,kc/2,kd/2,ke/2); - // printf("0x0%2X%2X%2X%2X%2X%2X%2X,0x%2X%2X%2X%2X%2X%2X%2X%2X\n",ke/2,kd/2,kc/2,kb/2,ka/2,k9/2,k8/2,k7/2,k6/2,k5/2,k4/2,k3/2,k2/2,k1/2,k0/2); + // printf("0x0%2X%2X%2X%2X%2X%2X%2X,%#2X%2X%2X%2X%2X%2X%2X%2X\n",ke/2,kd/2,kc/2,kb/2,ka/2,k9/2,k8/2,k7/2,k6/2,k5/2,k4/2,k3/2,k2/2,k1/2,k0/2); // Input ptrs: // Output ptrs: va0 = tmp ; vc0 = tm2 + k0; va1 = tmp+0x02; vc1 = tm2 + k1; diff --git a/src/radix24_ditN_cy_dif1.c b/src/radix24_ditN_cy_dif1.c index cf4478ad..08eb441a 100755 --- a/src/radix24_ditN_cy_dif1.c +++ b/src/radix24_ditN_cy_dif1.c @@ -299,7 +299,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "radix24_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "radix24_ditN_cy_dif1: Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -361,11 +361,11 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -404,7 +404,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -414,7 +414,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -451,18 +451,18 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 
64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix24_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix24_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 48 16-byte slots of sc_arr for temporaries, next 2 for the doubled cos and c3m1 terms, next 12 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -855,7 +855,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], p07 = p06 + p01; p08 = p07 + p01; p16 = p08 + p08; - ASSERT(HERE, p16 == p08+p08, "p16 != p08+p08; radix24 ASM macro requires this!"); + ASSERT(p16 == p08+p08, "p16 != p08+p08; radix24 ASM macro requires this!"); p01 = p01 + ( (p01 >> DAT_BITS) << PAD_BITS ); p02 = p02 + ( (p02 >> DAT_BITS) << PAD_BITS ); p03 = p03 + ( (p03 >> DAT_BITS) << PAD_BITS ); @@ -992,12 +992,12 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy_22 = (double *)malloc(j); ptr_prod += 
(uint32)(_cy_22== 0x0); _cy_23 = (double *)malloc(j); ptr_prod += (uint32)(_cy_23== 0x0); - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix24_ditN_cy_dif1."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays in radix24_ditN_cy_dif1."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/24-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix24_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix24_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1021,7 +1021,7 @@ int radix24_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); first_entry=FALSE; } /* endif(first_entry) */ @@ -1198,8 +1198,8 @@ for(outer=0; outer <= 1; outer++) for(ithread = 0; ithread < CY_THREADS; ithread++) { // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1209,8 +1209,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1219,26 +1219,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].s1p00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].s1p00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt 
of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif tdat[ithread].bjmodn00 = _bjmodn00[ithread]; @@ -1507,7 +1507,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy24_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy24_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1517,7 +1517,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix32_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1595,7 +1595,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 
1,"radix24_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 1,"radix24_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ _cy_00[ithread] = _cy_00[ithread-1]; _cy_01[ithread] = _cy_01[ithread-1]; _cy_02[ithread] = _cy_02[ithread-1]; @@ -2018,8 +2018,8 @@ void radix24_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. 
*/ @@ -2123,21 +2123,21 @@ void radix24_dit_pass1(double a[], int n) sse2_rnd= s1p00 + 0x41; half_arr= s1p00 + 0x42; /* This table needs 20x16 bytes */ #endif - ASSERT(HERE, (s1p00 == thread_arg->s1p00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((s1p00 == thread_arg->s1p00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix256_ditN_cy_dif1.c b/src/radix256_ditN_cy_dif1.c index c29b3a7c..97026476 100755 --- a/src/radix256_ditN_cy_dif1.c +++ 
b/src/radix256_ditN_cy_dif1.c @@ -263,7 +263,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex) // to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros: struct complex t[RADIX], *tptr; - double *addr,*addi; + const double *addr,*addi; int *itmp,*itm2; // Pointer into the bjmodn array int err; static int first_entry=TRUE; @@ -414,11 +414,11 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #if !defined(USE_SSE2) && defined(USE_FMA) // Precompute the FMA-modified twiddles for the 2nd-pass radix-16 DFTs: @@ -552,7 +552,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, 
pool_work_units); } else { main_work_units = 1; @@ -562,7 +562,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -605,23 +605,23 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 256 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread cslots_in_local_store = radix256_creals_in_local_store + (20+RADIX/2)/2; // Just add enough int64 space for both cases, plus some - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix256_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr 
not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -665,8 +665,8 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset256 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix256_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix256_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset256 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix256_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix256_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one , 1.0 ); @@ -815,7 +815,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1263,14 +1263,14 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < NDIVR/CY_THREADS; j++) @@ -1290,7 +1290,7 @@ int radix256_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); if(CY_THREADS > 1) { for(ithread = 1; ithread < CY_THREADS; ithread++) @@ -1469,8 +1469,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1480,8 +1480,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1490,20 +1490,20 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; - ASSERT(HERE, ((tmp + 0x400)->d0 == 2.0 && (tmp + 0x400)->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT(((tmp + 0x400)->d0 == 2.0 && (tmp + 0x400)->d1 == 2.0), "thread-local memcheck failed!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == 
wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif @@ -1513,11 +1513,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1530,8 +1530,8 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_SSE2) // This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots: - dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d0 * 
(tmp+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif // scale gets set immediately prior to calling carry macro, hence no use checking it here. /* init carries */ @@ -1754,7 +1754,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy256_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy256_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1764,7 +1764,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("%s end ; #tasks = %d, #free_tasks = %d\n",func, tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -2276,8 +2276,8 @@ void radix256_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2438,12 +2438,12 @@ void radix256_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); - // ASSERT(HERE, (isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!"); Disable to allow alternate "rounded down" variant of isrt2,sqrt2 + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); + // ASSERT((isrt2->d0 == ISRT2 && isrt2->d1 == ISRT2), "thread-local memcheck failed!"); Disable to allow alternate "rounded down" variant of isrt2,sqrt2 #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2453,18 +2453,18 @@ void radix256_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 
* (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { #ifdef USE_AVX512 /* No-Op */ #else - dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } diff --git a/src/radix256_main_carry_loop.h b/src/radix256_main_carry_loop.h index f6cc4398..c865409d 100755 --- a/src/radix256_main_carry_loop.h +++ b/src/radix256_main_carry_loop.h @@ -365,8 +365,8 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee // In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass: if(target_idx == j) { #ifdef USE_SSE2 - addr = (double *)s1p00 + target_set; - *addr += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. 
includes fwd DWT weight and n/2 factor + double *addr_ = (double *)s1p00 + target_set; + *addr_ += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor #else // target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum: l = target_set&1; target_set >>= 1; @@ -667,26 +667,28 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) { // LOACC with tunable DWT-weights chaining /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy_r; itmp = bjmodn; + l = 0; itmp = bjmodn; + double *addr_ = cy_r; for(ntmp = 0; ntmp < RADIX>>2; ntmp++) { jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp]; // poff[] = p04,08,... // Re-init weights every 4th macro invocation to keep errors under control: - cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } else { // HiACC: /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy_r; itmp = bjmodn; + l = 0; itmp = bjmodn; + 
double *addr_ = cy_r; for(ntmp = 0; ntmp < RADIX>>2; ntmp++) { jt = j1 + poff[ntmp]; jp = j2 + poff[ntmp]; // poff[] = p04,08,... - cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_pow2_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p01],a[jp+p01],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p02],a[jp+p02],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_pow2_errcheck (a[jt+p03],a[jp+p03],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } // LOACC or HIACC? @@ -880,13 +882,14 @@ normally be getting dispatched to [radix] separate blocks of the A-array, we nee #else // Scalar-double mode: // Can't use l as loop index here, since it gets used in the Fermat-mod carry macro (as are k1,k2): - ntmp = 0; addr = cy_r; addi = cy_i; + ntmp = 0; + double *addr_ = cy_r, *addi_ = cy_i; for(m = 0; m < RADIX>>2; m++) { jt = j1 + poff[m]; jp = j2 + poff[m]; // poff[] = p04,08,... 
- fermat_carry_norm_pow2_errcheck(a[jt ],a[jp ],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; - fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr,*addi,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr; ++addi; + fermat_carry_norm_pow2_errcheck(a[jt ],a[jp ],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p01],a[jp+p01],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p02],a[jp+p02],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; + fermat_carry_norm_pow2_errcheck(a[jt+p03],a[jp+p03],*addr_,*addi_,ntmp,NRTM1,NRT_BITS,prp_mult); ntmp += NDIVR; ++addr_; ++addi_; } #endif /* #ifdef USE_SSE2 */ diff --git a/src/radix288_ditN_cy_dif1.c b/src/radix288_ditN_cy_dif1.c index 31909e73..d074803f 100755 --- a/src/radix288_ditN_cy_dif1.c +++ b/src/radix288_ditN_cy_dif1.c @@ -299,7 +299,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -367,11 +367,11 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -409,7 +409,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -419,7 +419,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -459,24 +459,24 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix288_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix288_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix288_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -520,7 +520,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] sse2_rnd= tmp + 0x01; // sc_ptr += 0x(492 + 90 + 2) = 0x524; This is where the value of half_arr_offset288 comes from half_arr= tmp + 0x02; #endif - ASSERT(HERE, (radix288_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix288_creals_in_local_store checksum failed!"); + ASSERT((radix288_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix288_creals_in_local_store checksum failed!"); /* These remain fixed: 
*/ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); VEC_DBL_INIT(sqrt2, SQRT2); VEC_DBL_INIT(isrt2, ISRT2); @@ -1573,12 +1573,12 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1602,7 +1602,7 @@ int radix288_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1755,8 +1755,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* 
Pointer to the BASE and BASEINV arrays. */ @@ -1766,8 +1766,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1776,26 +1776,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif 
defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1928,7 +1928,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy288_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy288_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1938,7 +1938,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -3086,8 +3086,8 @@ void radix288_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -3856,21 +3856,21 @@ void radix288_dit_pass1(double a[], int n) half_arr= tmp + 0x02; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * 
(tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix28_ditN_cy_dif1.c b/src/radix28_ditN_cy_dif1.c index 64c4dc73..08d84010 100755 --- a/src/radix28_ditN_cy_dif1.c +++ b/src/radix28_ditN_cy_dif1.c @@ -518,7 +518,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(first_entry) { - ASSERT(HERE, LO_ADD,"LO_ADD"); + ASSERT(LO_ADD,"LO_ADD"); psave = p; nsave = n; radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)RADIX)); n2inv = qfdbl(qf_rational_quotient((int64)1, (int64)(n/2))); @@ -544,11 +544,11 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -586,7 +586,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init 
failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -596,7 +596,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -639,24 +639,24 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of radix28_creals_in_local_store vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix28_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - 
ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix28_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 56 16-byte slots of sc_arr for temporaries, next 8 for the nontrivial complex 16th roots, next 28 for the doubled carry pairs, next 2 for ROE and RND_CONST, next RADIX for the half_arr table lookup stuff, @@ -1154,13 +1154,13 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], poff[0] = 0; poff[1] = p04; poff[2] = p08; poff[3] = p12; poff[4] = p16; poff[5] = p20; poff[6] = p24; - ASSERT(HERE, p01+p01 == p02, "p01+p01 != p02"); - ASSERT(HERE, p02+p02 == p04, "p02+p02 != p04"); - ASSERT(HERE, p04+p04 == p08, "p04+p04 != p08"); - ASSERT(HERE, p08+p04 == p12, "p08+p04 != p12"); - ASSERT(HERE, p12+p04 == p16, "p12+p04 != p16"); - ASSERT(HERE, p16+p04 == p20, "p16+p04 != p20"); - ASSERT(HERE, p20+p04 == p24, "p20+p04 != p24"); + ASSERT(p01+p01 == p02, "p01+p01 != p02"); + ASSERT(p02+p02 == p04, "p02+p02 != p04"); + ASSERT(p04+p04 == p08, "p04+p04 != p08"); + ASSERT(p08+p04 == p12, "p08+p04 != p12"); + ASSERT(p12+p04 == p16, "p12+p04 != p16"); + ASSERT(p16+p04 == p20, "p16+p04 != p20"); + ASSERT(p20+p04 == p24, "p20+p04 != p24"); if(_cy_r[0]) /* If it's a new exponent of a range test, need to deallocate these. */ { @@ -1197,12 +1197,12 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1237,7 +1237,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1288,7 +1288,7 @@ int radix28_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; }; } - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1648,8 +1648,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1659,8 +1659,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1669,27 +1669,27 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].s1p00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].s1p00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].s1p00r; - ASSERT(HERE, ((tmp + 0x38)->d0 == 2.0 && (tmp + 0x38)->d1 == 2.0), "thread-local memcheck failed!"); - ASSERT(HERE, ((tmp + half_arr_offset28-1)->d0 == crnd && (tmp + half_arr_offset28-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp + 0x38)->d0 
== 2.0 && (tmp + 0x38)->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT(((tmp + half_arr_offset28-1)->d0 == crnd && (tmp + half_arr_offset28-1)->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp + half_arr_offset28+40)->d0 * (tmp + half_arr_offset28+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp + half_arr_offset28+40)->d1 * (tmp + half_arr_offset28+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp + half_arr_offset28+40)->d0 * (tmp + half_arr_offset28+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp + half_arr_offset28+40)->d1 * (tmp + half_arr_offset28+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp + half_arr_offset28+10)->d0 * (tmp + half_arr_offset28+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp + half_arr_offset28+10)->d1 * (tmp + half_arr_offset28+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp + half_arr_offset28+10)->d0 * (tmp + half_arr_offset28+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp + half_arr_offset28+10)->d1 * (tmp + half_arr_offset28+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1699,8 +1699,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp + half_arr_offset28)->d0 * (tmp + half_arr_offset28+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp + half_arr_offset28)->d1 * (tmp + half_arr_offset28+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp + half_arr_offset28)->d0 * (tmp + half_arr_offset28+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp + half_arr_offset28)->d1 * (tmp + half_arr_offset28+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1860,7 +1860,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy28_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy28_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1870,7 +1870,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2422,8 +2422,8 @@ void radix28_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2510,29 +2510,29 @@ void radix28_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20x16 bytes */ #endif - ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); #if defined(USE_AVX2) || defined(USE_ARM_V8_SIMD) // AVX2 (i.e. FMA)means non-Nussbaumer radix-7, uses these sincos constants: - ASSERT(HERE, (ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!"); + ASSERT((ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!"); #else /* SSE2 version assumes LO_ADD = 0, i.e. 
the low-mul Nussbaumer-style DFT implementation: */ - ASSERT(HERE, (ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!"); + ASSERT((ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!"); #endif - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix320_ditN_cy_dif1.c 
b/src/radix320_ditN_cy_dif1.c index 9da345b1..fdb8248f 100755 --- a/src/radix320_ditN_cy_dif1.c +++ b/src/radix320_ditN_cy_dif1.c @@ -324,7 +324,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -392,11 +392,11 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -434,7 +434,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -444,7 +444,7 @@ int 
radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -484,24 +484,24 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix320_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix320_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix320_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + 
ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -533,7 +533,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] sse2_rnd= tmp + 0x01; // sc_ptr += 0x(508 + a0) = 0x5b0; This is where the value of half_arr_offset320 comes from half_arr= tmp + 0x02; #endif - ASSERT(HERE, (radix320_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix320_creals_in_local_store checksum failed!"); + ASSERT((radix320_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix320_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(ycc1, cc1 ); // radix-5 DFT trig consts @@ -1203,12 +1203,12 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1232,7 +1232,7 @@ int radix320_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1385,8 +1385,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1396,8 +1396,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1406,26 +1406,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1555,7 +1555,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy320_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy320_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1565,7 +1565,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2678,8 +2678,8 @@ void radix320_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -2906,21 +2906,21 @@ void radix320_dit_pass1(double a[], int n) half_arr= tmp + 0x02; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * 
(tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix32_dif_dit_pass.c b/src/radix32_dif_dit_pass.c index 4b9167f4..5b53e809 100755 --- a/src/radix32_dif_dit_pass.c +++ b/src/radix32_dif_dit_pass.c @@ -99,15 +99,15 @@ void radix32_dif_pass(double a[], int n, struct complex rt0[], struct complex rt { max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); if(sc_arr != 0x0) { // Have previously-malloc'ed local storage free((void *)sc_arr); sc_arr=0x0; } - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 64 16-byte slots of sc_arr for temporaries, next 7 for the nontrivial complex 32nd roots, last 64 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array. 
@@ -169,7 +169,7 @@ void radix32_dif_pass(double a[], int n, struct complex rt0[], struct complex rt /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); r00 = __r0 + thr_id*0x90; cc0 = r00 + 0x41; #endif @@ -197,8 +197,8 @@ void radix32_dif_pass(double a[], int n, struct complex rt0[], struct complex rt p14 = p14 + ( (p14 >> DAT_BITS) << PAD_BITS ); p18 = p18 + ( (p18 >> DAT_BITS) << PAD_BITS ); p1C = p1C + ( (p1C >> DAT_BITS) << PAD_BITS ); - ASSERT(HERE, p04+p04 == p08, "p04+p04 != p08"); - ASSERT(HERE, p04+p08 == p0C, "p04+p08 != p0C"); + ASSERT(p04+p04 == p08, "p04+p04 != p08"); + ASSERT(p04+p08 == p0C, "p04+p08 != p0C"); /*...The radix-32 pass is here. */ @@ -1477,15 +1477,15 @@ void radix32_dit_pass(double a[], int n, struct complex rt0[], struct complex rt { max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); if(sc_arr != 0x0) { // Have previously-malloc'ed local storage free((void *)sc_arr); sc_arr=0x0; } - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 64 
16-byte slots of sc_arr for temporaries, next 7 for the nontrivial complex 32nd roots, last 64 for the doubled sincos twiddles, plus at least 3 more slots to allow for 64-byte alignment of the array. @@ -1545,7 +1545,7 @@ void radix32_dit_pass(double a[], int n, struct complex rt0[], struct complex rt /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); r00 = __r0 + thr_id*0x90; isrt2 = r00 + 0x40; cc0 = isrt2 + 1; diff --git a/src/radix32_ditN_cy_dif1.c b/src/radix32_ditN_cy_dif1.c index 4728a9e8..ec34bf81 100755 --- a/src/radix32_ditN_cy_dif1.c +++ b/src/radix32_ditN_cy_dif1.c @@ -324,11 +324,11 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong // printf("0: wt*inv-1 = %15.8e\n",fabs(wts_mult[0]*inv_mult[0] - 1.)); - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); // printf("1: wt*inv-1 = %15.8e\n",fabs(wts_mult[1]*inv_mult[1] - 1.)); #ifdef MULTITHREAD @@ -368,7 +368,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, 
pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -379,7 +379,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], main_work_units = 0; pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif } @@ -426,23 +426,23 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread cslots_in_local_store = radix32_creals_in_local_store + (20+RADIX/2)/2; // Just add enough int64 space for both cases, plus some - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = 
ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix32_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 64 vec_ddl-sized slots of sc_arr for temporaries, next 7 for the nontrivial complex 16th roots, next 32 for the vector carries, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -489,8 +489,8 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif -// ASSERT(HERE, half_arr_offset32 == (uint32)(half_arr-sc_ptr), "half_arr_offset32 mismatches actual!"); - ASSERT(HERE, (radix32_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix32_creals_in_local_store checksum failed!"); +// ASSERT(half_arr_offset32 == (uint32)(half_arr-sc_ptr), "half_arr_offset32 mismatches actual!"); + ASSERT((radix32_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix32_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); @@ -632,7 +632,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. 
qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); } exit(0); #endif @@ -999,14 +999,14 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < NDIVR/CY_THREADS; j++) @@ -1026,7 +1026,7 @@ int radix32_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); if(CY_THREADS > 1) { for(ithread = 1; ithread < CY_THREADS; ithread++) @@ -1205,8 +1205,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + 
ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1216,8 +1216,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1228,20 +1228,20 @@ for(outer=0; outer <= 1; outer++) // Dec 2015: fast-GCD usage of this routine may involve multiple 'main' arrays // on successive calls, so set here at runtime rather than in init-only block: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; - ASSERT(HERE, ((tmp + 0x40)->d0 == 2.0 && (tmp + 0x40)->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT(((tmp + 0x40)->d0 == 2.0 && 
(tmp + 0x40)->d1 == 2.0), "thread-local memcheck failed!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif @@ -1251,11 +1251,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1268,8 +1268,8 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_SSE2) // This is slightly different for power-of-2 DFTs: 
Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots: - dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1492,7 +1492,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy32_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy32_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1502,7 +1502,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix32_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1578,7 +1578,7 @@ for(outer=0; outer <= 1; outer++) // Must use NDIVR instead of p1 here since p1 may have pads which are not applied to element-2-slots-before j1 = NDIVR-2; j1 += ( (j1 >> DAT_BITS) << PAD_BITS ); j2 = j1+RE_IM_STRIDE; - ASSERT(HERE, t[RADIX-1].re <= 1.0 && t[RADIX-1].im <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!"); + ASSERT(t[RADIX-1].re <= 1.0 && t[RADIX-1].im <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!"); // Undo the initial dif pass just for the 16 complex terms in question: RADIX_32_DIT(\ a+j1,arr_offsets,RE_IM_STRIDE,\ @@ -1595,11 +1595,11 @@ 
for(outer=0; outer <= 1; outer++) // Verify that any cyout = 1 has the corresponding high word < 0, // then absorb cyout back into the high word and zero the carry: if(t[RADIX-1].re == 1.0) { - ASSERT(HERE, a[j1+p1C+p03] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!"); + ASSERT(a[j1+p1C+p03] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!"); a[j1+p1C+p03] += FFT_MUL_BASE; t[RADIX-1].re = 0.0; } if(t[RADIX-1].im == 1.0) { - ASSERT(HERE, a[j2+p1C+p03] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!"); + ASSERT(a[j2+p1C+p03] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!"); a[j2+p1C+p03] += FFT_MUL_BASE; t[RADIX-1].im = 0.0; } // Redo the initial dif pass just for the 16 complex terms in question: @@ -1940,8 +1940,8 @@ void radix32_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2028,10 +2028,10 @@ void radix32_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2041,18 +2041,18 @@ void radix32_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 
1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { #ifdef USE_AVX512 /* No-Op */ #else - dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } diff --git a/src/radix32_dyadic_square.c b/src/radix32_dyadic_square.c index 50c615a6..09338f5a 100755 --- a/src/radix32_dyadic_square.c +++ b/src/radix32_dyadic_square.c @@ -166,7 +166,7 @@ void radix32_dyadic_square( b = (double *)(fwd_fft_only & ~0xCull); // BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0: if(fwd_fft_only & 0xC) { - ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry + ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry fwd_fft_only = 3ull; } } @@ -184,9 +184,9 @@ void radix32_dyadic_square( /**************************************************************************************************************************************/ if((rad0save != radix0) || (nsave != n)) { - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); 
nsave = n; - ASSERT(HERE, N2 == n/2, "N2 bad!"); + ASSERT(N2 == n/2, "N2 bad!"); rad0save = radix0; ndivrad0 = n/radix0; ndivrad0m1 = ndivrad0-1; // ndivrad0 always a power of 2, so can do a fast-mod via & (ndivrad0-1) for(j = 0; j < ndivrad0; j += stride) @@ -194,7 +194,7 @@ void radix32_dyadic_square( j1 = j + ( (j >> DAT_BITS) << PAD_BITS ); if( (j1+stridh) != (j+stridh) + ( ((j+stridh) >> DAT_BITS) << PAD_BITS ) ) { printf("j, j1, stride/2 = %d,%d,%d, jpad = %d\n",j,j1, stridh, (j+stridh) + (((j+stridh) >> DAT_BITS) << PAD_BITS) ); - ASSERT(HERE, 0 , "add1 calculation violates padded index rules!"); + ASSERT(0 , "add1 calculation violates padded index rules!"); } } // Nov 2017: For the non-synthetic final-pass radices (16 and 32) the default contiguous-data chunksize @@ -214,9 +214,9 @@ void radix32_dyadic_square( ! Allocate and initialize an index array containing N/32 indices... index_ptmp = ALLOC_INT(N2/32); - if(!index_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array INDEX in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index_ptmp){ sprintf(cbuf,"ERROR: unable to allocate array INDEX in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } index = ALIGN_INT(index_ptmp); - if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index){ sprintf(cbuf,"ERROR: unable to allocate array ITMP in radix32_dyadic_square.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } for(i=0; i < N2/32; i++) { index[i]=i; @@ -226,11 +226,11 @@ void radix32_dyadic_square( index1_mod = (n>>6)/radix0; /* complex length requires an additional divide by 2 */ index_ptmp0 = ALLOC_INT(index_ptmp0, index0_mod); - if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index_ptmp0){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP0 
in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } index0 = ALIGN_INT(index_ptmp0); index_ptmp1 = ALLOC_INT(index_ptmp1, index1_mod); - if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!index_ptmp1){ sprintf(cbuf,"ERROR: unable to allocate array INDEX_PTMP1 in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } index1 = ALIGN_INT(index_ptmp1); for(i=0; i < index0_mod; i++){index0[i]= i;} @@ -248,7 +248,7 @@ void radix32_dyadic_square( if(i == radix0) break; } - if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(nradices_prim_radix0 >= nradices_prim) { sprintf(cbuf,"ERROR: nradices_prim_radix0 must be < nradices_prim in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } bit_reverse_int(index0, index0_mod, nradices_prim_radix0, &radix_prim[nradices_prim_radix0-1], -1, (int *)arr_scratch); bit_reverse_int(index1, index1_mod, nradices_prim-5-nradices_prim_radix0, &radix_prim[nradices_prim -6], -1, (int *)arr_scratch); @@ -263,10 +263,10 @@ void radix32_dyadic_square( if(init_sse2 <= max_threads) // current alloc sufficient return; - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif #ifdef USE_SSE2 @@ -276,14 +276,14 @@ void radix32_dyadic_square( } // Index vectors used in SIMD roots-computation. // Nov 2017: Add pair of int-slots per thread here ----vv, to support synthesized final-pass radices >= 256. 
- // sm_arr = ALLOC_INT(sm_arr, max_threads*(14*RE_IM_STRIDE+2) + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - sm_arr = ALLOC_INT(sm_arr, max_threads* 14*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + // sm_arr = ALLOC_INT(sm_arr, max_threads*(14*RE_IM_STRIDE+2) + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + sm_arr = ALLOC_INT(sm_arr, max_threads* 14*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sm_ptr = ALIGN_INT(sm_arr); - ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); // Twiddles-array: - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x94*max_threads + 100); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x94*max_threads + 100); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 64 vec_dbl slots of sc_arr for temporaries, next 8 for scratch, next 7 for the nontrivial complex 16th roots, next 62 for the doubled sincos twiddles, next 4 for [1.0,2.0,{0.25, unused in fermat-mod mode},sqrt2] and at least 3 more to allow for 64-byte alignment of the array. 
@@ -381,7 +381,7 @@ void radix32_dyadic_square( /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); #ifdef USE_SSE2 k1_arr = __i0 + thr_id*14*RE_IM_STRIDE; k2_arr = k1_arr + 7*RE_IM_STRIDE; @@ -432,9 +432,9 @@ void radix32_dyadic_square( #endif /*...If a new runlength, should not get to this point: */ - ASSERT(HERE, n == nsave,"n != nsave"); - ASSERT(HERE, incr == 64,"incr == 64"); -// ASSERT(HERE, ndivrad0 == n/radix0,"bad value for ndivrad0!"); Synthesized final-pass radices break this + ASSERT(n == nsave,"n != nsave"); + ASSERT(incr == 64,"incr == 64"); +// ASSERT(ndivrad0 == n/radix0,"bad value for ndivrad0!"); Synthesized final-pass radices break this /* k = ii*(ndivrad0 >> 6); */ @@ -1039,8 +1039,8 @@ printf("c[%2d] = %18.15f,%18.15f,%18.15f,%18.15f,%18.15f,%18.15f,%18.15f,%18.15f add0 = &a[j1]; add1 = &a[j1+stridh]; - // printf("stride = %d, add0,1 = %llX, %llX, diff = %llX\n",stride,(int64)add0,(int64)add1, (int64)add1-(int64)add0); exit(0); - // ASSERT(HERE, (j1+stride) == (j+stride) + ( ((j+stride) >> DAT_BITS) << PAD_BITS ) , "add1 calculation violates padded index rules!"); + // printf("stride = %d, add0,1 = %" PRIX64 ", %" PRIX64 ", diff = %" PRIX64 "\n",stride,(int64)add0,(int64)add1, (int64)add1-(int64)add0); exit(0); + // ASSERT((j1+stride) == (j+stride) + ( ((j+stride) >> DAT_BITS) << PAD_BITS ) , "add1 calculation violates padded index rules!"); #ifdef USE_AVX512 // The generic pre-dyadic-square macro needs 8 main-array addresses in AVX mode // because (add[1,3,5,7]-add[0,2,4,6]) have opposite signs for Fermat and Mersenne-mod: add1 = add0 + 64; diff --git a/src/radix32_wrapper_ini.c b/src/radix32_wrapper_ini.c index 59094d2c..fa40b1dd 100755 --- a/src/radix32_wrapper_ini.c +++ b/src/radix32_wrapper_ini.c @@ -75,7 +75,7 @@ void radix32_wrapper_ini(int n, int 
radix0, int iblock, int nradices_prim, int r ws_m [iblock_next] = m ; ws_blocklen [iblock_next] = blocklen ; ws_blocklen_sum[iblock_next] = blocklen_sum; - // printf("%8llu %20llu %8llu: init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k); + // printf("%8" PRIu64 " %20" PRIu64 " %8" PRIu64 ": init ws_k[%3d] = %10d\n",j1,((uint64)j1*radix0),j2,iblock_next,k); return; } jump_in: // Entry point for all blocks but the first. diff --git a/src/radix32_wrapper_square.c b/src/radix32_wrapper_square.c index 06732110..88687a5a 100755 --- a/src/radix32_wrapper_square.c +++ b/src/radix32_wrapper_square.c @@ -216,7 +216,7 @@ void radix32_wrapper_square( b = (double *)(fwd_fft_only & ~0xCull); // BUT, if bits 2:3 == 0, must avoid zeroing fwd_fft_only since "do 2-input dyadic-mul following fwd-FFT" relies on that != 0: if(fwd_fft_only & 0xC) { - ASSERT(HERE, (fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry + ASSERT((fwd_fft_only & 0xF) == 0xC,"Illegal value for bits 2:3 of fwd_fft_only!"); // Otherwise bits 2:3 should've been zeroed prior to entry fwd_fft_only = 3ull; } } @@ -235,10 +235,10 @@ void radix32_wrapper_square( nsave = n; if(init_sse2 > max_threads) // current SIMD local-alloc insufficient { - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif // printf("%Ns: max_threads = %d, NTHREADS = %d\n",func, max_threads, NTHREADS); @@ -250,14 +250,14 @@ void radix32_wrapper_square( // Index vectors used in SIMD roots-computation. 
// The AVX512 compute-sincos-mults code needs 2 elements per complex-double-load, so use 14*RE_IM_STRIDE per array // to alloc storage here for all cases, even though that leaves upper array halves unused for sub-AVX512. - sm_arr = ALLOC_INT(sm_arr, max_threads*28*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sm_arr = ALLOC_INT(sm_arr, max_threads*28*RE_IM_STRIDE + 16); if(!sm_arr){ sprintf(cbuf, "ERROR: unable to allocate sm_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sm_ptr = ALIGN_INT(sm_arr); - ASSERT(HERE, ((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); // Twiddles-array: Need 0x92 slots for data, plus need to leave room to pad-align. // v20: To support inline a*(b-c) for p-1 stage 2, need 2*RADIX = 64 added vec_dbl, thus 0x98 ==> 0xd8: - sc_arr = ALLOC_VEC_DBL(sc_arr, 0xd8*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0xd8*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 64 vec_dbl slots of sc_arr for temporaries, next 8 for scratch, next 7 for the nontrivial complex 16th roots, next 62 for the doubled sincos twiddles, next 4 for [1.0,2.0,0.25,sqrt2] and at least 3 more to allow for 64-byte alignment of the array. @@ -362,7 +362,7 @@ void radix32_wrapper_square( ! for the itmp space and that sent to the bit_reverse_int for scratch space ! 
don't overlap: */ - ASSERT(HERE, N2 == n/2, "N2 bad!"); + ASSERT(N2 == n/2, "N2 bad!"); itmp = (int *)&arr_scratch[N2/32]; /* Conservatively assume an int might be as long as 8 bytes here */ for(i=0; i < N2/32; i++) { @@ -452,7 +452,7 @@ void radix32_wrapper_square( free((void *)index_ptmp); index_ptmp=0x0; } index_ptmp = ALLOC_INT(index_ptmp, N2/32); - ASSERT(HERE, index_ptmp != 0,"ERROR: unable to allocate array INDEX!"); + ASSERT(index_ptmp != 0,"ERROR: unable to allocate array INDEX!"); index = ALIGN_INT(index_ptmp); /* !...Now rearrange FFT sincos indices using the main loop structure as a template. @@ -491,7 +491,7 @@ void radix32_wrapper_square( if(j2_start == n-64)break; blocklen_sum = blocklen_sum + blocklen; - ASSERT(HERE, i != 0,"ERROR 10!"); + ASSERT(i != 0,"ERROR 10!"); blocklen = (radix_prim[i-1]-1)*blocklen_sum; j2_start = j2_start+(blocklen<<2); @@ -511,7 +511,7 @@ void radix32_wrapper_square( /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); #ifdef USE_SSE2 k1_arr = __i0 + thr_id*28*RE_IM_STRIDE; k2_arr = k1_arr + 14*RE_IM_STRIDE; @@ -559,7 +559,7 @@ void radix32_wrapper_square( #endif #endif /*...If a new runlength, should not get to this point: */ - ASSERT(HERE, n == nsave,"n != nsave"); + ASSERT(n == nsave,"n != nsave"); /* ! 
SOLVING THE CACHE FLOW PROBLEM FOR BIT-REVERSED ARRAY DATA: diff --git a/src/radix352_ditN_cy_dif1.c b/src/radix352_ditN_cy_dif1.c index 7ca5e726..1a0a4085 100755 --- a/src/radix352_ditN_cy_dif1.c +++ b/src/radix352_ditN_cy_dif1.c @@ -340,7 +340,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -408,11 +408,11 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -450,7 +450,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init 
threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -460,7 +460,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -500,24 +500,24 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix352_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix352_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + 
radix352_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -588,7 +588,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfadd(qt,qtheta); cq4 = qfcos(qt); sq4 = qfsin(qt); //================================================================ #endif - ASSERT(HERE, (radix352_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix352_creals_in_local_store checksum failed!"); + ASSERT((radix352_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix352_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); #if 1 @@ -1425,12 +1425,12 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1454,7 +1454,7 @@ int radix352_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1607,8 +1607,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1618,8 +1618,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1628,26 +1628,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1660,7 +1660,7 @@ for(outer=0; outer <= 1; outer++) /******************* AVX debug stuff: *******************/ #if 0 int ipad; - ASSERT(HERE, p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!"); + ASSERT(p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!"); // Use RNG to populate data array: rng_isaac_init(TRUE); double dtmp = 1024.0*1024.0*1024.0*1024.0; @@ -1935,7 +1935,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy352_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy352_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1945,7 +1945,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - 
ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2901,8 +2901,8 @@ void radix352_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -3432,21 +3432,21 @@ void radix352_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - 
dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix36_ditN_cy_dif1.c b/src/radix36_ditN_cy_dif1.c index f986e838..3f6b75e4 100755 --- a/src/radix36_ditN_cy_dif1.c +++ b/src/radix36_ditN_cy_dif1.c @@ -294,7 +294,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -356,11 +356,11 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -399,7 +399,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -409,7 +409,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -449,24 +449,24 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix36_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix36_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 192 16-byte slots of sc_arr for r-and-s temporaries, next 7 for the nontrivial complex 16th roots, next 36 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the   table lookup stuff, @@ -537,7 +537,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], sse2_rnd= tmp + 0x01; // sc_ptr += 2 = 0xa5 [avx] or 0xae [sse2]; This is where the value of half_arr_offset36 comes from half_arr= tmp + 0x02; /* This table needs 20x16 bytes */ - ASSERT(HERE, (radix36_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix36_creals_in_local_store checksum failed!"); + 
ASSERT((radix36_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix36_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); @@ -880,12 +880,12 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -909,7 +909,7 @@ int radix36_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1061,8 +1061,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + 
ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1072,8 +1072,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1082,26 +1082,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + 
ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1114,7 +1114,7 @@ for(outer=0; outer <= 1; outer++) /******************* AVX debug stuff: *******************/ #if 0 int ipad; - ASSERT(HERE, p01 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!"); + ASSERT(p01 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!"); // Use RNG to populate data array: rng_isaac_init(TRUE); double dtmp = 128.0*1024.0*1024.0*1024.0*1024.0; // 2^47 @@ -1389,7 +1389,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy36_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy36_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1399,7 +1399,7 @@ for(outer=0; outer <= 
1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix40_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1855,8 +1855,8 @@ void radix36_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. 
*/ @@ -1953,21 +1953,21 @@ void radix36_dit_pass1(double a[], int n) sse2_rnd= tmp + 0x01; half_arr= tmp + 0x02; /* This table needs 20x16 bytes */ - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix384_ditN_cy_dif1.c b/src/radix384_ditN_cy_dif1.c index 5acdadb9..a87b6dff 100755 --- a/src/radix384_ditN_cy_dif1.c +++ 
b/src/radix384_ditN_cy_dif1.c @@ -210,7 +210,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // Local storage: We must use an array here because scalars have no guarantees about relative address offsets // [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex) // to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros: - double *addr, *addi; + const double *addr, *addi; struct complex t[RADIX], *tptr; int err; static int first_entry=TRUE; @@ -310,7 +310,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array: @@ -376,11 +376,11 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -418,7 +418,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -428,7 +428,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -468,24 +468,24 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix384_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix384_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix384_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -548,8 +548,8 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif -// ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix384_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix384_creals_in_local_store checksum failed!"); +// ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset 
mismatches actual!"); + ASSERT((radix384_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix384_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); @@ -1102,12 +1102,12 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1131,7 +1131,7 @@ int radix384_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1283,8 +1283,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, 
"thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1294,8 +1294,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1304,26 +1304,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), 
"thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1336,7 +1336,7 @@ for(outer=0; outer <= 1; outer++) /******************* AVX debug stuff: *******************/ #if 0 int ipad; - ASSERT(HERE, p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!"); + ASSERT(p1 >= 16, "Smallest array-stride must be large enough to hold an AVX-512 vec_cmplx!"); // Use RNG to populate data array: rng_isaac_init(TRUE); double dtmp = 1024.0*1024.0*1024.0*1024.0; @@ -1608,7 +1608,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy384_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy384_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1618,7 +1618,7 @@ 
for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2410,8 +2410,8 @@ void radix384_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. 
*/ @@ -2690,21 +2690,21 @@ void radix384_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r000 == thread_arg->r000), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r000 == thread_arg->r000), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix384_main_carry_loop.h b/src/radix384_main_carry_loop.h index 37ea6bad..b835a670 100755 --- 
a/src/radix384_main_carry_loop.h +++ b/src/radix384_main_carry_loop.h @@ -229,8 +229,8 @@ if(tid == 1) { // In data-init we set target_idx = -1 on wraparound-carry mini-pass, so if() only taken on full pass: if(target_idx == j) { #ifdef USE_SSE2 - addr = (double *)s1p000 + target_set; - *addr += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor + double *addr_ = (double *)s1p000 + target_set; + *addr_ += target_cy*(n>>1); // target_cy = [-2 << within-word-shift]*[DWT weight]*n/2, i.e. includes fwd DWT weight and n/2 factor #else // target_set in [0,2*RADIX); tidx_mod_stride [even|odd] means shifted-carry goes into [Re|Im] part of the complex FFT datum: l = target_set&1; target_set >>= 1; @@ -529,26 +529,28 @@ if(tid == 1) { if(USE_SHORT_CY_CHAIN < USE_SHORT_CY_CHAIN_MAX) { // LOACC with tunable DWT-weights chaining /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy; itmp = bjmodn; + l = 0; itmp = bjmodn; + double *addr_ = cy; for(l1 = 0; l1 < RADIX>>2; l1++) { jt = j1 + poff[l1]; jp = j2 + poff[l1]; // poff[] = p04,08,... 
// Re-init weights every 4th macro invocation to keep errors under control: - cmplx_carry_norm_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_fast_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_fast_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } else { // HiACC: /*...set0 is slightly different from others; divide work into blocks of 4 macro calls, 1st set of which gets pulled out of loop: */ - l = 0; addr = cy; itmp = bjmodn; + l = 0; itmp = bjmodn; + double *addr_ = cy; for(l1 = 0; l1 < RADIX>>2; l1++) { jt = j1 + poff[l1]; jp = j2 + poff[l1]; // poff[] = p04,08,... - cmplx_carry_norm_errcheck0(a[jt ],a[jp ],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_errcheck (a[jt+p1],a[jp+p1],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_errcheck (a[jt+p2],a[jp+p2],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; - cmplx_carry_norm_errcheck (a[jt+p3],a[jp+p3],*addr,*itmp,l,prp_mult); ++l; ++addr; ++itmp; + cmplx_carry_norm_errcheck0(a[jt ],a[jp ],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_errcheck (a[jt+p1],a[jp+p1],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_errcheck (a[jt+p2],a[jp+p2],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; + cmplx_carry_norm_errcheck (a[jt+p3],a[jp+p3],*addr_,*itmp,l,prp_mult); ++l; ++addr_; ++itmp; } } // LOACC or HIACC? 
diff --git a/src/radix4032_ditN_cy_dif1.c b/src/radix4032_ditN_cy_dif1.c index 7be1ae8a..538d93d0 100755 --- a/src/radix4032_ditN_cy_dif1.c +++ b/src/radix4032_ditN_cy_dif1.c @@ -388,11 +388,11 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -431,7 +431,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -441,7 +441,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init 
failed!"); #endif @@ -484,24 +484,24 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix4032_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix4032_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -529,13 +529,13 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ 
#endif - ASSERT(HERE, half_arr_offset4032 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT(half_arr_offset4032 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { j = (1<<(2*(L2_SZ_VD-2))) + 4; // 16+4 for sse2, 64+4 for avx } else { j = ODD_RADIX<<2; // 4*ODD_RADIX } - ASSERT(HERE, (radix4032_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix4032_creals_in_local_store checksum failed!"); + ASSERT((radix4032_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix4032_creals_in_local_store checksum failed!"); /* SSE2 math = 53-mantissa-bit IEEE double-float: */ VEC_DBL_INIT(sse2_rnd, crnd); @@ -604,7 +604,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -979,12 +979,12 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1019,7 +1019,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1070,7 +1070,7 @@ int radix4032_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[ break; }; } // printf("wts_idx_incr = %u\n",wts_idx_incr); - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1504,8 +1504,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1515,8 +1515,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1525,26 +1525,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt 
of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1554,8 +1554,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1776,7 +1776,7 @@ for(outer=0; outer <= 1; outer++) /*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/ for(j = 0; j < main_work_units; ++j) { - ASSERT(HERE, 0x0 == cy4032_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy4032_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1786,7 +1786,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2885,8 +2885,8 @@ void radix4032_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2945,23 +2945,23 @@ void radix4032_dit_pass1(double a[], int n) half_arr= tmp + 0x02; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = 
(tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix4032_main_carry_loop.h b/src/radix4032_main_carry_loop.h index 29ddc950..dcddb954 100755 --- a/src/radix4032_main_carry_loop.h +++ b/src/radix4032_main_carry_loop.h @@ -181,7 +181,7 @@ for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... */ // (and only then: for all subsequent blocks it's superfluous), this assignment decrements co2 by radix(1). // *But*: since the init macro does an on-the-fly version of this between j,j+2 portions, external code co2=co3 must come *after* both ctmp-data octets are inited. 
#ifdef USE_AVX512 - ASSERT(HERE, 0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!"); + ASSERT(0, "AVX-512 version of AVX_cmplx_carry_fast_wtsinit_X8 not yet ported!"); #endif AVX_cmplx_carry_fast_wtsinit_X8(add1,add2,add3, itmp, half_arr,sign_mask, n_minus_sil,n_minus_silp1,sinwt,sinwtm1, sse_bw,sse_n) diff --git a/src/radix40_ditN_cy_dif1.c b/src/radix40_ditN_cy_dif1.c index 8af575b9..ed0473f0 100755 --- a/src/radix40_ditN_cy_dif1.c +++ b/src/radix40_ditN_cy_dif1.c @@ -278,7 +278,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], } if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); #ifdef USE_IMCI512 // WARN(HERE, "radix40_ditN_cy_dif1: No k1om / IMCI-512 support; Skipping this leading radix.", "", 1); return(ERR_RADIX0_UNAVAILABLE); @@ -343,11 +343,11 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -386,7 +386,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -396,7 +396,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -436,24 +436,24 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix40_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix40_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 80 16-byte slots of sc_arr for temporaries, next 5 for the nontrivial complex 16th roots, next 80 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -527,7 +527,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], sse2_rnd= tmp + 0x01; // sc_ptr += 180 [AVX] or 190 [SSE2]; This is where the value of half_arr_offset40 comes from half_arr= tmp + 0x02; /* This table needs 20x16 bytes */ - ASSERT(HERE, (radix40_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix40_creals_in_local_store checksum failed!"); + 
ASSERT((radix40_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix40_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); @@ -877,12 +877,12 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -906,7 +906,7 @@ int radix40_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1058,8 +1058,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + 
ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1069,8 +1069,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1079,26 +1079,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + 
ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1231,7 +1231,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy40_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy40_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1241,7 +1241,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix40_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); 
@@ -1665,8 +1665,8 @@ void radix40_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -1769,21 +1769,21 @@ void radix40_dit_pass1(double a[], int n) sse2_rnd= tmp + 0x01; // sc_ptr += 180 [AVX] or 190 [SSE2]; This is where the value of half_arr_offset40 comes from half_arr= tmp + 0x02; /* This table needs 20x16 bytes */ - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; 
ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix44_ditN_cy_dif1.c b/src/radix44_ditN_cy_dif1.c index 0c61a220..7cb1fbb6 100755 --- a/src/radix44_ditN_cy_dif1.c +++ b/src/radix44_ditN_cy_dif1.c @@ -325,7 +325,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -387,11 +387,11 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); } #ifdef MULTITHREAD @@ -431,7 +431,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -441,7 +441,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -481,24 +481,24 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, 
"wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix44_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix44_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix44_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x1f) == 0, "sm_ptr not 32-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x1f) == 0, "sm_ptr not 32-byte aligned!"); /* Use low 88x2 16-byte slots of sc_arr for temporaries, next 21 for the constants needed by the radix-11 DFT, next RADIX/2 = 22 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -552,7 +552,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (radix44_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix44_creals_in_local_store checksum failed!"); + 
ASSERT((radix44_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix44_creals_in_local_store checksum failed!"); #if (defined(USE_AVX2) && DFT_11_FMA) || defined(USE_ARM_V8_SIMD) /* no-op */ #else @@ -639,35 +639,35 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #if 0 //================================================================ // *** Cosine terms: *** - qt = qfsub(qfadd(cq0,cq2),qfadd(cq3,cq4)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a0, "a0"); /* a0 = ( cq0 - cq3+ cq2- cq4) */ - qt = qfsub(qfadd(cq1,cq2),qfadd(cq3,cq4)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a1, "a1"); /* a1 = ( cq1- cq3+ cq2- cq4) */ - qt = qfsub(qfadd(cq1,cq2),qfadd(cq0,cq3)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a3, "a3"); /* a3 = (- cq0+ cq1- cq3+ cq2 ) */ - qt = qfsub(qfadd(cq1,cq4),qfadd(cq0,cq3)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a4, "a4"); /* a4 = (- cq0+ cq1- cq3 + cq4) */ - qt = qfsub(cq2,cq3); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a6, "a6"); /* a6 = ( - cq3+ cq2 ) */ - qt = qfsub(cq1,cq3); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a7, "a7"); /* a7 = ( cq1- cq3 ) */ - qt = qfmul(qfifth, qfsub( qfmul_pow2(cq3,2), qfadd(qfadd(cq0,cq1),qfadd(cq2,cq4)) )); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a8, "a8"); /* a8 = (- cq0- cq1+4*cq3- cq2- cq4)/5 */ - qt = qfsub( qfmul(qfifth, qfadd( cq0 , qfadd(qfadd(cq1,cq2),qfadd(cq3,cq4)) )), QONE); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a9, "a9"); /* a9 = ( cq0+ cq1+ cq3+ cq2+ cq4)/5 - 1 */ + qt = qfsub(qfadd(cq0,cq2),qfadd(cq3,cq4)); dtmp = qfdbl(qt); ASSERT(dtmp == a0, "a0"); /* a0 = ( cq0 - cq3+ cq2- cq4) */ + qt = qfsub(qfadd(cq1,cq2),qfadd(cq3,cq4)); dtmp = qfdbl(qt); ASSERT(dtmp == a1, "a1"); /* a1 = ( cq1- cq3+ cq2- cq4) */ + qt = qfsub(qfadd(cq1,cq2),qfadd(cq0,cq3)); dtmp = qfdbl(qt); ASSERT(dtmp == a3, "a3"); /* a3 = (- cq0+ cq1- cq3+ cq2 ) */ + qt = qfsub(qfadd(cq1,cq4),qfadd(cq0,cq3)); dtmp = qfdbl(qt); ASSERT(dtmp == a4, "a4"); /* a4 = (- cq0+ 
cq1- cq3 + cq4) */ + qt = qfsub(cq2,cq3); dtmp = qfdbl(qt); ASSERT(dtmp == a6, "a6"); /* a6 = ( - cq3+ cq2 ) */ + qt = qfsub(cq1,cq3); dtmp = qfdbl(qt); ASSERT(dtmp == a7, "a7"); /* a7 = ( cq1- cq3 ) */ + qt = qfmul(qfifth, qfsub( qfmul_pow2(cq3,2), qfadd(qfadd(cq0,cq1),qfadd(cq2,cq4)) )); dtmp = qfdbl(qt); ASSERT(dtmp == a8, "a8"); /* a8 = (- cq0- cq1+4*cq3- cq2- cq4)/5 */ + qt = qfsub( qfmul(qfifth, qfadd( cq0 , qfadd(qfadd(cq1,cq2),qfadd(cq3,cq4)) )), QONE); dtmp = qfdbl(qt); ASSERT(dtmp == a9, "a9"); /* a9 = ( cq0+ cq1+ cq3+ cq2+ cq4)/5 - 1 */ qs = qfadd(qfadd(cq0,cq1), cq2); qs = qfmul_pow2(qs,1); // 2*(cq0+cq1+cq2) qt = qfadd(cq3,cq4); qt = qfadd(qt, qfmul_pow2(qt,1)); // 3*(cq3+cq4) - qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a2, "a2"); /* a2 = (-2*cq0-2*cq1+3*cq3-2*cq2+3*cq4)/5 */ + qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(dtmp == a2, "a2"); /* a2 = (-2*cq0-2*cq1+3*cq3-2*cq2+3*cq4)/5 */ qs = qfadd(qfadd(cq4,cq1), cq2); qs = qfmul_pow2(qs,1); // 2*(cq4+cq1+cq2) qt = qfadd(cq3,cq0); qt = qfadd(qt, qfmul_pow2(qt,1)); // 3*(cq3+cq0) - qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == a5, "a5"); /* a5 = ( 3*cq0-2*cq1+3*cq3-2*cq2-2*cq4)/5 */ + qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(dtmp == a5, "a5"); /* a5 = ( 3*cq0-2*cq1+3*cq3-2*cq2-2*cq4)/5 */ // *** Sine terms: *** - qt = qfsub(qfadd(sq0,sq2),qfadd(sq3,sq4)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b0, "b0"); /* b0 = ( sq0 - sq3+ sq2- sq4) */ - qt = qfsub(qfsub(sq2,sq1),qfadd(sq3,sq4)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b1, "b1"); /* b1 = ( -sq1- sq3+ sq2- sq4) */ - qt = qfsub(qfsub(sq2,sq1),qfadd(sq0,sq3)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b3, "b3"); /* b3 = (- sq0- sq1- sq3+ sq2 ) */ - qt = qfsub(qfsub(sq4,sq1),qfadd(sq0,sq3)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b4, "b4"); /* b4 = (- sq0- sq1- sq3 + sq4) */ - qt = qfsub(sq2,sq3); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b6, "b6"); /* b6 = ( - sq3+ sq2 ) */ 
- qt = qfneg(qfadd(sq1,sq3)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b7, "b7"); /* b7 = ( -sq1- sq3 ) */ - qt = qfmul(qfifth, qfsub( qfmul_pow2(sq3,2), qfadd(qfsub(sq0,sq1),qfadd(sq2,sq4)) )); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b8, "b8"); /* b8 = (- sq0+ sq1+4*sq3- sq2- sq4)/5 */ - qt = qfmul(qfifth, qfadd( sq0 , qfadd(qfsub(sq2,sq1),qfadd(sq3,sq4)) )); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b9, "b9"); /* b9 = ( sq0- sq1+ sq3+ sq2+ sq4)/5 - 1 */ + qt = qfsub(qfadd(sq0,sq2),qfadd(sq3,sq4)); dtmp = qfdbl(qt); ASSERT(dtmp == b0, "b0"); /* b0 = ( sq0 - sq3+ sq2- sq4) */ + qt = qfsub(qfsub(sq2,sq1),qfadd(sq3,sq4)); dtmp = qfdbl(qt); ASSERT(dtmp == b1, "b1"); /* b1 = ( -sq1- sq3+ sq2- sq4) */ + qt = qfsub(qfsub(sq2,sq1),qfadd(sq0,sq3)); dtmp = qfdbl(qt); ASSERT(dtmp == b3, "b3"); /* b3 = (- sq0- sq1- sq3+ sq2 ) */ + qt = qfsub(qfsub(sq4,sq1),qfadd(sq0,sq3)); dtmp = qfdbl(qt); ASSERT(dtmp == b4, "b4"); /* b4 = (- sq0- sq1- sq3 + sq4) */ + qt = qfsub(sq2,sq3); dtmp = qfdbl(qt); ASSERT(dtmp == b6, "b6"); /* b6 = ( - sq3+ sq2 ) */ + qt = qfneg(qfadd(sq1,sq3)); dtmp = qfdbl(qt); ASSERT(dtmp == b7, "b7"); /* b7 = ( -sq1- sq3 ) */ + qt = qfmul(qfifth, qfsub( qfmul_pow2(sq3,2), qfadd(qfsub(sq0,sq1),qfadd(sq2,sq4)) )); dtmp = qfdbl(qt); ASSERT(dtmp == b8, "b8"); /* b8 = (- sq0+ sq1+4*sq3- sq2- sq4)/5 */ + qt = qfmul(qfifth, qfadd( sq0 , qfadd(qfsub(sq2,sq1),qfadd(sq3,sq4)) )); dtmp = qfdbl(qt); ASSERT(dtmp == b9, "b9"); /* b9 = ( sq0- sq1+ sq3+ sq2+ sq4)/5 - 1 */ qs = qfadd(qfsub(sq0,sq1), sq2); qs = qfmul_pow2(qs,1); // 2*(sq0-sq1+sq2) qt = qfadd(sq3,sq4); qt = qfadd(qt, qfmul_pow2(qt,1)); // 3*(sq3+sq4) - qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b2, "b2"); /* b2 = (-2*sq0+2*sq1+3*sq3-2*sq2+3*sq4)/5 */ + qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(dtmp == b2, "b2"); /* b2 = (-2*sq0+2*sq1+3*sq3-2*sq2+3*sq4)/5 */ qs = qfadd(qfsub(sq4,sq1), sq2); qs = qfmul_pow2(qs,1); // 2*(sq4-sq1+sq2) qt = qfadd(sq3,sq0); qt = qfadd(qt, 
qfmul_pow2(qt,1)); // 3*(sq3+sq0) - qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(HERE, dtmp == b5, "b5"); /* b5 = ( 3*sq0+2*sq1+3*sq3-2*sq2-2*sq4)/5 */ + qt = qfmul(qfifth, qfsub(qt,qs)); dtmp = qfdbl(qt); ASSERT(dtmp == b5, "b5"); /* b5 = ( 3*sq0+2*sq1+3*sq3-2*sq2-2*sq4)/5 */ //================================================================ #endif #endif @@ -1042,12 +1042,12 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1071,7 +1071,7 @@ int radix44_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1223,8 +1223,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, 
tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1234,8 +1234,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1244,26 +1244,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), 
"thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1396,7 +1396,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy44_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy44_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1406,7 +1406,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // 
printf("radix44_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -2065,8 +2065,8 @@ this means that the output permutation translates (in terms of of 4 radix-11 mac double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -2150,21 +2150,21 @@ this means that the output permutation translates (in terms of of 4 radix-11 mac half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck 
failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix48_ditN_cy_dif1.c b/src/radix48_ditN_cy_dif1.c index 22eb6cc2..f0d50cba 100755 --- a/src/radix48_ditN_cy_dif1.c +++ b/src/radix48_ditN_cy_dif1.c @@ -299,7 +299,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -367,11 +367,11 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -410,7 +410,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -420,7 +420,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -460,24 +460,24 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix48_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix48_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix48_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -909,12 +909,12 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -938,7 +938,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1090,8 +1090,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1101,8 +1101,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1111,26 +1111,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1263,7 +1263,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy48_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy48_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1273,7 +1273,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -1754,8 +1754,8 @@ void radix48_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -1869,21 +1869,21 @@ void radix48_dit_pass1(double a[], int n) sse2_rnd= tmp + 0x01; // sc_ptr += 0xe2 = 226; This is where the value of half_arr_offset48 comes from half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*odd_radix] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (r00r == thread_arg->r00r), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00r == thread_arg->r00r), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) 
< EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix512_ditN_cy_dif1.c b/src/radix512_ditN_cy_dif1.c index c1e524f0..ed56d439 100755 --- a/src/radix512_ditN_cy_dif1.c +++ b/src/radix512_ditN_cy_dif1.c @@ -56,7 +56,7 @@ void radix512_dif_pass1(double a[], int n) // Local storage: We must use an array here because scalars have no guarantees about relative address offsets // [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex) // to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros: - double *addr,*addi; + const double *addr,*addi; #include "radix1024_twiddles.h" // Can share radix-1024 table, just use first 31 of 63 rows here struct complex t[RADIX], *tptr; @@ -69,7 +69,7 @@ void radix512_dif_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n >> 9; @@ -350,7 +350,7 @@ void radix512_dit_pass1(double a[], int n) static int poffs[16],po_br[32]; // We prefer pointer-based array-element access, because that allows our radix16 DFT-with-twiddles // to look the same in terms of array-element arglists: - double *addr,*addi; + const double *addr,*addi; struct complex *tptr; #include "radix1024_twiddles.h" // Local storage: We must use an array here because scalars have no guarantees about relative address 
offsets diff --git a/src/radix52_ditN_cy_dif1.c b/src/radix52_ditN_cy_dif1.c index f621ac72..cabb6680 100755 --- a/src/radix52_ditN_cy_dif1.c +++ b/src/radix52_ditN_cy_dif1.c @@ -302,7 +302,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Init these to get rid of GCC "may be used uninitialized in this function" warnings: @@ -364,11 +364,11 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -407,7 +407,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { 
main_work_units = 1; @@ -417,7 +417,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -457,24 +457,24 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of 88 dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix52_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix52_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + 
ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 48 16-byte slots of sc_arr for temporaries, next 2 for the doubled cos and c3m1 terms, next 52/2 = 26 for the doubled carry pairs, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -524,7 +524,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r // sc_ptr += 256; This is where the value of half_arr_offset52 comes from half_arr= tmp + 0x02; /* This table needs 20x16 bytes for Mersenne-mod, and radixx16 for Fermat-mod */ #endif - ASSERT(HERE, (radix52_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix52_creals_in_local_store checksum failed!"); + ASSERT((radix52_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix52_creals_in_local_store checksum failed!"); /* These remain fixed: */ tmp = rad13_const-2; /* __cc pointer offsets: */ VEC_DBL_INIT(tmp, 1.0); ++tmp; /* -0x020 = 1.0 */ @@ -912,12 +912,12 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -941,7 +941,7 @@ const double cc1= 0.88545602565320989590, /* Real part of exp(i*2*pi/13), the r { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1093,8 +1093,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1104,8 +1104,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1114,26 +1114,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1266,7 +1266,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy52_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy52_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1276,7 +1276,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix52_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -1735,8 +1735,8 @@ void radix52_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block 
wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -1818,21 +1818,21 @@ void radix52_dit_pass1(double a[], int n) sse2_rnd= tmp + 0x01; half_arr= tmp + 0x02; /* This table needs 20x16 bytes for Mersenne-mod, and radixx16 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * 
(tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix56_ditN_cy_dif1.c b/src/radix56_ditN_cy_dif1.c index a7e236a1..6593b47b 100755 --- a/src/radix56_ditN_cy_dif1.c +++ b/src/radix56_ditN_cy_dif1.c @@ -421,7 +421,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(first_entry) { - ASSERT(HERE, LO_ADD,"LO_ADD"); + ASSERT(LO_ADD,"LO_ADD"); psave = p; nsave = n; radix_inv = qfdbl(qf_rational_quotient((int64)1, (int64)RADIX)); n2inv = qfdbl(qf_rational_quotient((int64)1, (int64)(n/2))); @@ -447,11 +447,11 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. 
wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -489,7 +489,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -499,7 +499,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -542,24 +542,24 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] 
not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of radix56_creals_in_local_store vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix56_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix56_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low slots of sc_arr for temporaries, next few for the nontrivial complex 16th roots, next few for the doubled carry pairs, next 2 for ROE and RND_CONST, next RADIX for the half_arr table lookup stuff, @@ -755,7 +755,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // 
Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1147,12 +1147,12 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1187,7 +1187,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1238,7 +1238,7 @@ int radix56_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; }; } - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1657,8 +1657,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - 
ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1668,8 +1668,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1678,19 +1678,19 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + 
ASSERT(tdat[ithread].r00r == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -1699,11 +1699,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1713,8 +1713,8 @@ for(outer=0; outer <= 1; outer++) else /* 
Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. */ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1936,7 +1936,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy56_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy56_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1946,7 +1946,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2496,8 +2496,8 @@ void radix56_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2636,16 +2636,16 @@ void radix56_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, (two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); + ASSERT((two->d0 == 2.0 && two->d1 == 2.0), "thread-local memcheck failed!"); #if defined(USE_AVX2) || defined(USE_ARM_V8_SIMD) // AVX2 (i.e. FMA)means non-Nussbaumer radix-7, uses these sincos constants: - ASSERT(HERE, (ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!"); + ASSERT((ss3->d0 == 0.0 && ss3->d1 == 0.0), "thread-local memcheck failed!"); #else /* SSE2 version assumes LO_ADD = 0, i.e. 
the low-mul Nussbaumer-style DFT implementation: */ - ASSERT(HERE, (ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!"); + ASSERT((ss3->d0 == sx3 && ss3->d1 == sx3), "thread-local memcheck failed!"); #endif #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2654,15 +2654,15 @@ void radix56_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local 
memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix60_ditN_cy_dif1.c b/src/radix60_ditN_cy_dif1.c index c2ddcad7..55c7aae3 100755 --- a/src/radix60_ditN_cy_dif1.c +++ b/src/radix60_ditN_cy_dif1.c @@ -457,11 +457,11 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong // printf("0: wt*inv-1 = %15.8e\n",fabs(wts_mult[0]*inv_mult[0] - 1.)); - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); // printf("1: wt*inv-1 = %15.8e\n",fabs(wts_mult[1]*inv_mult[1] - 1.)); #ifdef MULTITHREAD @@ -500,7 +500,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -510,7 +510,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - 
ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -553,25 +553,25 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread, the latter of which // provide thread-local storage for int-data and tables // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix60_creals_in_local_store + (((12+RADIX/2)/2 + ODD_RADIX + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix60_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 
64-byte aligned!"); /* Use low 2*RADIX vector-double-sized slots of sc_arr for s1p* temporaries, next 2*RADIX slots for r* temps, next RADIX slots for x and y in-place DFT temps, next 7 for the complex root combs needed for the radix-3 and -5 sub-DFTs, @@ -705,8 +705,8 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset60 == (uint32)(half_arr-sc_ptr), "half_arr_offset60 mismatches actual!"); - ASSERT(HERE, (radix60_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix60_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset60 == (uint32)(half_arr-sc_ptr), "half_arr_offset60 mismatches actual!"); + ASSERT((radix60_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix60_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(sse2_c3m1, c3m1); VEC_DBL_INIT(sse2_s , s ); @@ -790,7 +790,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1153,18 +1153,18 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(_cy_r[0]) /* If it's a new exponent of a range test, need to deallocate these. 
*/ { - ASSERT(HERE, 0 != _i, "free(_i) but ptr = 0x0!"); + ASSERT(0 != _i, "free(_i) but ptr = 0x0!"); for(i= 0; i < RADIX; i++) { - ASSERT(HERE, 0 != _bjmodn[i], "free(_bjmodn[i]) but ptr = 0x0!"); - ASSERT(HERE, 0 != _cy_r[i], "free(_cy_r[i]) but ptr = 0x0!"); - ASSERT(HERE, 0 != _cy_i[i], "free(_cy_i[i]) but ptr = 0x0!"); + ASSERT(0 != _bjmodn[i], "free(_bjmodn[i]) but ptr = 0x0!"); + ASSERT(0 != _cy_r[i], "free(_cy_r[i]) but ptr = 0x0!"); + ASSERT(0 != _cy_i[i], "free(_cy_i[i]) but ptr = 0x0!"); } - ASSERT(HERE, 0 != _jstart, "free(_jstart) but ptr = 0x0!"); - ASSERT(HERE, 0 != _jhi, "free(_jhi) but ptr = 0x0!"); - ASSERT(HERE, 0 != _col, "free(_col) but ptr = 0x0!"); - ASSERT(HERE, 0 != _co2, "free(_co2) but ptr = 0x0!"); - ASSERT(HERE, 0 != _co3, "free(_co3) but ptr = 0x0!"); - ASSERT(HERE, 0 != _bjmodnini, "free(_bjmodnini) but ptr = 0x0!"); + ASSERT(0 != _jstart, "free(_jstart) but ptr = 0x0!"); + ASSERT(0 != _jhi, "free(_jhi) but ptr = 0x0!"); + ASSERT(0 != _col, "free(_col) but ptr = 0x0!"); + ASSERT(0 != _co2, "free(_co2) but ptr = 0x0!"); + ASSERT(0 != _co3, "free(_co3) but ptr = 0x0!"); + ASSERT(0 != _bjmodnini, "free(_bjmodnini) but ptr = 0x0!"); free((void *)_i ); _i = 0x0; for(i = 0; i < RADIX; i++) { @@ -1198,12 +1198,12 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1238,7 +1238,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1289,7 +1289,7 @@ int radix60_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; }; } - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1757,8 +1757,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1768,8 +1768,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1778,21 +1778,21 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; - ASSERT(HERE, ((tmp + 300)->d0 == c3m1 && (tmp + 300)->d1 == c3m1), "thread-local memcheck failed!"); + ASSERT(((tmp + 300)->d0 == c3m1 && (tmp + 300)->d1 == c3m1), "thread-local memcheck failed!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use 
VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -1801,11 +1801,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1815,8 +1815,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -2038,7 +2038,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy60_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy60_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -2048,7 +2048,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix32_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -2630,8 +2630,8 @@ void radix60_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2780,7 +2780,7 @@ void radix60_dit_pass1(double a[], int n) y0e = tmp + 0x1c; tmp += 0x1e; - ASSERT(HERE, (tmp->d0 == tmp->d1) && (tmp->d0 == c3m1), "thread-local memcheck failed!"); + ASSERT((tmp->d0 == tmp->d1) && (tmp->d0 == c3m1), "thread-local memcheck failed!"); sse2_c3m1 = tmp + 0x00; sse2_s = tmp + 0x01; sse2_cn1 = tmp + 0x02; @@ -2815,10 +2815,10 @@ void radix60_dit_pass1(double a[], int n) // +20 = 390 complex, round up to nearby multiple of 4 #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2827,15 +2827,15 @@ void radix60_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, 
"thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix60_main_carry_loop.h b/src/radix60_main_carry_loop.h index 7ecc98f4..298470dd 100755 --- a/src/radix60_main_carry_loop.h +++ b/src/radix60_main_carry_loop.h @@ -644,7 +644,7 @@ for(k=1; k <= khi; k++) /* Do n/(radix(1)*nwt) outer loop executions... 
*/ #ifdef USE_AVX512 // will never hit this since have same assert in preprocessing code - just a placeholder/reminder: - ASSERT(HERE, 0, "radix60_ditN_cy_dif1: No AVX-512 support for Fermat-mod; Skipping this leading radix."); + ASSERT(0, "radix60_ditN_cy_dif1: No AVX-512 support for Fermat-mod; Skipping this leading radix."); #else // AVX / AVX2 diff --git a/src/radix63_ditN_cy_dif1.c b/src/radix63_ditN_cy_dif1.c index 41d8438c..5fa18017 100755 --- a/src/radix63_ditN_cy_dif1.c +++ b/src/radix63_ditN_cy_dif1.c @@ -265,7 +265,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -275,7 +275,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -315,7 +315,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < 4, "Failed to align cy_dat array!"); + ASSERT(l < 4, "Failed to align cy_dat array!"); } #endif @@ -367,12 +367,12 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], } _maxerr = (double *)malloc(j); ptr_prod += (uint32)(_maxerr== 0x0); - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary 
arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -407,7 +407,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -458,7 +458,7 @@ int radix63_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; }; } - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); /* Subtract nwt from the increments to ease fast-mod */ wts_idx_incr -= nwt; @@ -608,8 +608,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -619,8 +619,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = _maxerr[ithread]; @@ -629,11 +629,11 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* init carries */ @@ -732,7 +732,7 @@ for(outer=0; outer <= 1; outer++) /*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/ for(j = 0; j < main_work_units; ++j) { - ASSERT(HERE, 0x0 == cy63_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy63_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -742,7 +742,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks 
!= pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ diff --git a/src/radix64_ditN_cy_dif1.c b/src/radix64_ditN_cy_dif1.c index 52470015..3fbc585e 100755 --- a/src/radix64_ditN_cy_dif1.c +++ b/src/radix64_ditN_cy_dif1.c @@ -401,11 +401,11 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -444,7 +444,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -454,7 +454,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = 
threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -497,23 +497,23 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128 vec_dbl and ([8 if SSE2, 16 if AVX] + RADIX/2) uint64 element slots per thread cslots_in_local_store = radix64_creals_in_local_store + (20+RADIX/2)/2; // Just add enough int64 space for both cases, plus some - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix64_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); /* Use low 64 vec_ddl-sized slots of sc_arr for temporaries, next 7 for the nontrivial complex 16th roots, next 32 for 
the vector carries, next 2 for ROE and RND_CONST, next 20 for the half_arr table lookup stuff, @@ -679,8 +679,8 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif -// ASSERT(HERE, half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix64_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix64_creals_in_local_store checksum failed!"); +// ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix64_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix64_creals_in_local_store checksum failed!"); #if !USE_SCALAR_DFT_MACRO /* These remain fixed: */ @@ -691,7 +691,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], dtmp = *(double *)&isrt2_dn; VEC_DBL_INIT(isrt2, dtmp); VEC_DBL_INIT(nisrt2,-dtmp); VEC_DBL_INIT( isrt2, dtmp); // Copies of +ISRT2 needed for 30-asm-macro-operand-GCC-limit workaround: - VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(HERE, tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant + VEC_DBL_INIT( cc0, 1.0); VEC_DBL_INIT( ss0, 0.0); // tmp = cc0-1; ASSERT(tmp->d0 == ISRT2 && tmp->d1 == ISRT2, "tmp->d0,1 != ISRT2"); Disable to allow "round down" variant VEC_DBL_INIT( cc1, c64_1); VEC_DBL_INIT( ss1, s64_1); tmp = cc1-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc2, c32_1); VEC_DBL_INIT( ss2, s32_1); tmp = cc2-1; VEC_DBL_INIT(tmp, dtmp); VEC_DBL_INIT( cc3, c64_3); VEC_DBL_INIT( ss3, s64_3); tmp = cc3-1; VEC_DBL_INIT(tmp, dtmp); @@ -799,7 +799,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], qc = qfcos(qt); qs = qfsin(qt); qx = QONE; 
qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -1198,14 +1198,14 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < NDIVR/CY_THREADS; j++) @@ -1225,7 +1225,7 @@ int radix64_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); if(CY_THREADS > 1) { for(ithread = 1; ithread < CY_THREADS; ithread++) @@ -1404,8 +1404,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck 
fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1415,8 +1415,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1425,19 +1425,19 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].r00; tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this 
vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif @@ -1447,11 +1447,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1464,8 +1464,8 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_SSE2) // This is slightly different for power-of-2 DFTs: Here, scale is in the +2 slot, base & baseinv remain fixed in 0,+1 slots: - dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck 
failed!"); - dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d0 * (tmp+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = tmp->d1 * (tmp+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1687,7 +1687,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy64_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy64_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1697,7 +1697,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } // printf("radix64_ditN_cy_dif1 end ; #tasks = %d, #free_tasks = %d\n", tpool->tasks_queue.num_tasks, tpool->free_tasks_queue.num_tasks); @@ -2524,8 +2524,8 @@ void radix64_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). 
- ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -2783,10 +2783,10 @@ Workaround: Compiled just this file with -O2, rest with usual -O3. half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -2796,18 +2796,18 @@ Workaround: Compiled just this file with -O2, rest with usual -O3. 
/* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { #ifdef USE_AVX512 /* No-Op */ #else - dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d0 * (half_arr+1)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (half_arr)->d1 * (half_arr+1)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } diff --git a/src/radix768_ditN_cy_dif1.c b/src/radix768_ditN_cy_dif1.c index edec074a..4c10958a 100755 --- a/src/radix768_ditN_cy_dif1.c +++ b/src/radix768_ditN_cy_dif1.c @@ -311,7 +311,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(MODULUS_TYPE == MODULUS_TYPE_FERMAT) { - ASSERT(HERE, 0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); + ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!"); } // Jan 2018: 
To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array: @@ -377,11 +377,11 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -419,7 +419,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -429,7 +429,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -469,24 +469,24 @@ int 
radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use double-complex type size (16 bytes) to alloc a block of local storage // consisting of radix768_creals_in_local_store dcomplex and (12+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix768_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix768_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -541,8 +541,8 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif -// ASSERT(HERE, half_arr_offset == 
(uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix768_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix768_creals_in_local_store checksum failed!"); +// ASSERT(half_arr_offset == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix768_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r000) + (20 << L2_SZ_VD), "radix768_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one, 1.0 ); @@ -1328,12 +1328,12 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays."); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/RADIX-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1357,7 +1357,7 @@ int radix768_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); #ifdef USE_PTHREAD /* Populate the elements of the thread-specific data structs which don't change after init: */ @@ -1509,8 +1509,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1520,8 +1520,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1530,26 +1530,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r000 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of 
base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif #endif /* init carries: */ @@ -1679,7 +1679,7 @@ for(outer=0; outer <= 1; outer++) for(j = 0; j < main_work_units; ++j) { // printf("adding main task %d\n",j + pool_work_units); - ASSERT(HERE, 0x0 == cy768_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy768_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1689,7 +1689,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2668,8 +2668,8 @@ void radix768_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... 
double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; /* constant index offsets for array load/stores are here. */ @@ -3107,21 +3107,21 @@ void radix768_dit_pass1(double a[], int n) half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, 2 for Fermat-mod */ #endif - ASSERT(HERE, (r000 == thread_arg->r000), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r000 == thread_arg->r000), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; #ifdef USE_AVX512 /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) 
< EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix7_ditN_cy_dif1.c b/src/radix7_ditN_cy_dif1.c index 280e714a..40ad49b6 100755 --- a/src/radix7_ditN_cy_dif1.c +++ b/src/radix7_ditN_cy_dif1.c @@ -119,7 +119,7 @@ int radix7_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(first_entry) { - ASSERT(HERE, LO_ADD,"radix7_ditN_cy_dif1.c: LO_ADD"); + ASSERT(LO_ADD,"radix7_ditN_cy_dif1.c: LO_ADD"); psave = p; nsave = n; first_entry=FALSE; diff --git a/src/radix8_dif_dit_pass.c b/src/radix8_dif_dit_pass.c index 0552bb27..fb0ac012 100755 --- a/src/radix8_dif_dit_pass.c +++ b/src/radix8_dif_dit_pass.c @@ -104,12 +104,12 @@ void radix8_dif_pass(double a[], int n, struct complex rt0[], struct complex rt1 { max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); - sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); + sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte 
aligned!"); /* Use low 16 16-byte slots of sc_arr for temporaries, next 16 for the doubled sincos twiddles, next 1 for doubled 1/sqrt2, plus at least 3 more slots to allow for 64-byte alignment of the array: @@ -191,7 +191,7 @@ void radix8_dif_pass(double a[], int n, struct complex rt0[], struct complex rt1 /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); c0 = __r0 + thr_id*36 + 0x10; c4 = c0 + 0x02; c2 = c0 + 0x04; @@ -564,12 +564,12 @@ void radix8_dit_pass(double a[], int n, struct complex rt0[], struct complex rt1 { max_threads = init_sse2; #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); - sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); + sc_arr = ALLOC_VEC_DBL(sc_arr, 36*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Use low 16 16-byte slots of sc_arr for temporaries, next 16 for the doubled sincos twiddles, next 1 for doubled 1/sqrt2, plus at least 3 more slots to allow for 64-byte alignment of the array: @@ -654,7 +654,7 @@ void radix8_dit_pass(double a[], int n, struct complex rt0[], struct complex rt1 /* If multithreaded, set the local-store pointers needed for the 
current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); c0 = __r0 + thr_id*36 + 0x10; c4 = c0 + 0x02; c2 = c0 + 0x04; diff --git a/src/radix8_ditN_cy_dif1.c b/src/radix8_ditN_cy_dif1.c index 3b7414c7..b1c22dde 100755 --- a/src/radix8_ditN_cy_dif1.c +++ b/src/radix8_ditN_cy_dif1.c @@ -127,12 +127,12 @@ int radix8_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], if(CY_THREADS > MAX_THREADS) CY_THREADS = MAX_THREADS; - ASSERT(HERE, CY_THREADS >= NTHREADS,"radix8_ditN_cy_dif1.c: CY_THREADS < NTHREADS"); - ASSERT(HERE, isPow2(CY_THREADS) ,"radix8_ditN_cy_dif1.c: CY_THREADS not a power of 2!"); + ASSERT(CY_THREADS >= NTHREADS,"radix8_ditN_cy_dif1.c: CY_THREADS < NTHREADS"); + ASSERT(isPow2(CY_THREADS) ,"radix8_ditN_cy_dif1.c: CY_THREADS not a power of 2!"); if(CY_THREADS > 1) { - ASSERT(HERE, n8 %CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n8 %CY_THREADS != 0 ... likely more threads than this leading radix can handle."); - ASSERT(HERE, n_div_nwt%CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n_div_nwt%CY_THREADS != 0 ... likely more threads than this leading radix can handle."); + ASSERT(n8 %CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n8 %CY_THREADS != 0 ... likely more threads than this leading radix can handle."); + ASSERT(n_div_nwt%CY_THREADS == 0,"radix8_ditN_cy_dif1.c: n_div_nwt%CY_THREADS != 0 ... 
likely more threads than this leading radix can handle."); } #ifdef MULTITHREAD @@ -192,47 +192,47 @@ int radix8_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], free((void *)_bjmodnini); _bjmodnini = 0x0; } - _i = (int *)malloc(CY_THREADS*sizeof(int)); if(!_i ) { sprintf(cbuf,"ERROR: unable to allocate array _i in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn0 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn0){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn1 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn1){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn2 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn2){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn3 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn3){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn4 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn4){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn5 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn5){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn6 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn6){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _bjmodn7 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn7){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn7 in radix8_ditN_cy_dif1.\n"); 
fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _jstart = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jstart ){ sprintf(cbuf,"ERROR: unable to allocate array _jstart in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _jhi = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jhi ){ sprintf(cbuf,"ERROR: unable to allocate array _jhi in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _col = (int *)malloc(CY_THREADS*sizeof(int)); if(!_col ){ sprintf(cbuf,"ERROR: unable to allocate array _col in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _co2 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co2 ){ sprintf(cbuf,"ERROR: unable to allocate array _co2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _co3 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co3 ){ sprintf(cbuf,"ERROR: unable to allocate array _co3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - - _cy_r0 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r1 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r2 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r3 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r4 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r4 in radix8_ditN_cy_dif1.\n"); 
fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r5 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r6 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_r7 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - - _cy_i0 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i1 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i2 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i3 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i4 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i5 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i6 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i6){ sprintf(cbuf,"ERROR: unable to allocate array 
_cy_i6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - _cy_i7 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } - - _maxerr = (double *)malloc(CY_THREADS*sizeof(double)); if(!_maxerr){ sprintf(cbuf,"ERROR: unable to allocate array _maxerr in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _i = (int *)malloc(CY_THREADS*sizeof(int)); if(!_i ) { sprintf(cbuf,"ERROR: unable to allocate array _i in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn0 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn0){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn1 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn1){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn2 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn2){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn3 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn3){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn4 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn4){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn5 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn5){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn6 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn6){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn6 in 
radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _bjmodn7 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_bjmodn7){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodn7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _jstart = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jstart ){ sprintf(cbuf,"ERROR: unable to allocate array _jstart in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _jhi = (int *)malloc(CY_THREADS*sizeof(int)); if(!_jhi ){ sprintf(cbuf,"ERROR: unable to allocate array _jhi in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _col = (int *)malloc(CY_THREADS*sizeof(int)); if(!_col ){ sprintf(cbuf,"ERROR: unable to allocate array _col in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _co2 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co2 ){ sprintf(cbuf,"ERROR: unable to allocate array _co2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _co3 = (int *)malloc(CY_THREADS*sizeof(int)); if(!_co3 ){ sprintf(cbuf,"ERROR: unable to allocate array _co3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + + _cy_r0 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_r1 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_r2 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_r3 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + 
_cy_r4 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_r5 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_r6 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_r7 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_r7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_r7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + + _cy_i0 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i0){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i0 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i1 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i1){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i1 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i2 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i2){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i2 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i3 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i3){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i3 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i4 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i4){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i4 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i5 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i5){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i5 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i6 = (double 
*)malloc(CY_THREADS*sizeof(double)); if(!_cy_i6){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i6 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + _cy_i7 = (double *)malloc(CY_THREADS*sizeof(double)); if(!_cy_i7){ sprintf(cbuf,"ERROR: unable to allocate array _cy_i7 in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } + + _maxerr = (double *)malloc(CY_THREADS*sizeof(double)); if(!_maxerr){ sprintf(cbuf,"ERROR: unable to allocate array _maxerr in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/8-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in radix8_ditN_cy_dif1.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; for(j=0; j < n8/CY_THREADS; j++) @@ -253,7 +253,7 @@ int radix8_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[], { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); } } /* endif(first_entry) */ @@ -635,7 +635,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 1,"radix8_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 1,"radix8_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if 
multiple threads */ _cy_r0[ithread] = _cy_r0[ithread-1]; _cy_r1[ithread] = _cy_r1[ithread-1]; _cy_r2[ithread] = _cy_r2[ithread-1]; @@ -673,7 +673,7 @@ for(outer=0; outer <= 1; outer++) // Must use n8 instead of p1 here since p1 may have pads which are not applied to element-2-slots-before j1 = n8-2; j1 += ( (j1 >> DAT_BITS) << PAD_BITS ); j2 = j1+RE_IM_STRIDE; - ASSERT(HERE, t15 <= 1.0 && t16 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!"); + ASSERT(t15 <= 1.0 && t16 <= 1.0, "genFFTmul expects carryouts = 0 or 1 at top!"); // Undo the initial dif pass just for the 16 complex terms in question: RADIX_08_DIT(a[j1],a[j2],a[j1+p1],a[j2+p1],a[j1+p2],a[j2+p2],a[j1+p3],a[j2+p3],a[j1+p4],a[j2+p4],a[j1+p5],a[j2+p5],a[j1+p6],a[j2+p6],a[j1+p7],a[j2+p7] ,_t1,_t2,_t3,_t4,_t5,_t6,_t7,_t8,_t9,_t10,_t11,_t12,_t13,_t14,_t15,_t16 @@ -691,11 +691,11 @@ for(outer=0; outer <= 1; outer++) // Verify that any cyout = 1 has the corresponding high word < 0, // then absorb cyout back into the high word and zero the carry: if(t15 == 1.0) { - ASSERT(HERE, a[j1+p7] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!"); + ASSERT(a[j1+p7] < 0.0, "genFFTmul: Legal Re-cyout = 1 must have the corresponding high word < 0!"); a[j1+p7] += FFT_MUL_BASE; t15 = 0.0; } if(t16 == 1.0) { - ASSERT(HERE, a[j2+p7] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!"); + ASSERT(a[j2+p7] < 0.0, "genFFTmul: Legal Im-cyout = 1 must have the corresponding high word < 0!"); a[j2+p7] += FFT_MUL_BASE; t16 = 0.0; } // Redo the initial dif pass just for the 16 complex terms in question: @@ -707,7 +707,7 @@ for(outer=0; outer <= 1; outer++) for(ithread = CY_THREADS - 1; ithread > 0; ithread--) { - ASSERT(HERE, CY_THREADS > 1,"radix8_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ + ASSERT(CY_THREADS > 1,"radix8_ditN_cy_dif1.c: "); /* Make sure loop only gets executed if multiple threads */ _cy_r0[ithread] = _cy_r0[ithread-1]; 
_cy_i0[ithread] = _cy_i0[ithread-1]; _cy_r1[ithread] = _cy_r1[ithread-1]; _cy_i1[ithread] = _cy_i1[ithread-1]; _cy_r2[ithread] = _cy_r2[ithread-1]; _cy_i2[ithread] = _cy_i2[ithread-1]; diff --git a/src/radix960_ditN_cy_dif1.c b/src/radix960_ditN_cy_dif1.c index fc7433f5..f9ffe751 100755 --- a/src/radix960_ditN_cy_dif1.c +++ b/src/radix960_ditN_cy_dif1.c @@ -484,11 +484,11 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qt = qfexp(qt); // ...and get 2^x via exp[x*ln(2)]. wts_mult[0] = qfdbl(qt); // a = 2^(x/n), with x = sw inv_mult[0] = qfdbl(qfinv(qt)); // Double-based inversion (1.0 / wts_mult_a[0]) often gets LSB wrong - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); //curr have w, 2/w, separate-mul-by-1-or-0.5 gives [w,w/2] and [1/w,2/w] for i = 0,1, resp: wts_mult[1] = 0.5*wts_mult[0]; inv_mult[1] = 2.0*inv_mult[0]; - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); #ifdef MULTITHREAD @@ -526,7 +526,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -536,7 +536,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(CY_THREADS, 
MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -579,24 +579,24 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < RE_IM_STRIDE, "Failed to align cy_dat array!"); + ASSERT(l < RE_IM_STRIDE, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix960_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix960_creals_in_local_store); - ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -666,8 +666,8 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, 
double wt0[] // This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 32 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset960 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix960_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix960_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset960 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix960_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (20 << L2_SZ_VD), "radix960_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(isrt2,ISRT2); @@ -759,7 +759,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] qc = qfcos(qt); qs = qfsin(qt); qx = QONE; qy = QZRO; for(j = 0; j < RADIX; j++) { - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; @@ -849,7 +849,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] } /************************************************************************/ else /* MODULUS_TYPE_MERSENNE: */ { /************************************************************************/ - ASSERT(HERE, tmp == half_arr, "tmp == half_arr check failed!"); + ASSERT(tmp == half_arr, "tmp == half_arr check failed!"); #ifdef USE_AVX512 /* Each lookup-category in the 'mini-tables' used in AVX mode balloons from 16x32-bytes to 64x64-bytes, so switch to an opmask-based scheme which starts with e.g. a broadcast constant and onditional doubling. 
@@ -1271,12 +1271,12 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1311,7 +1311,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1362,7 +1362,7 @@ int radix960_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; }; } // printf("wts_idx_incr = %u\n",wts_idx_incr); - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1796,8 +1796,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local 
memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. */ @@ -1807,8 +1807,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1817,21 +1817,21 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + 
ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; #ifdef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts - ASSERT(HERE, ((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); - // ASSERT(HERE, ((tmp+0)->d0 == 0.50 && (tmp+0)->d1 == 0.50 && (tmp+0)->d2 == 0.50 && (tmp+0)->d3 == 0.50 && (tmp+0)->d4 == 0.50 && (tmp+0)->d5 == 0.50 && (tmp+0)->d6 == 0.50 && (tmp+0)->d7 == 0.50, "thread-local memcheck failed!"); - // ASSERT(HERE, ((tmp+1)->d0 == 0.25 && (tmp+1)->d1 == 0.25 && (tmp+1)->d2 == 0.25 && (tmp+1)->d3 == 0.25 && (tmp+1)->d4 == 0.25 && (tmp+1)->d5 == 0.25 && (tmp+1)->d6 == 0.25 && (tmp+1)->d7 == 0.25, "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == base[0] && (tmp-1)->d1 == baseinv[1] && (tmp-1)->d2 == wts_mult[1] && (tmp-1)->d3 == inv_mult[0]), "thread-local memcheck failed!"); + // ASSERT(((tmp+0)->d0 == 0.50 && (tmp+0)->d1 == 0.50 && (tmp+0)->d2 == 0.50 && (tmp+0)->d3 == 0.50 && (tmp+0)->d4 == 0.50 && (tmp+0)->d5 == 0.50 && (tmp+0)->d6 == 0.50 && (tmp+0)->d7 == 0.50, "thread-local memcheck failed!"); + // ASSERT(((tmp+1)->d0 == 0.25 && (tmp+1)->d1 == 0.25 && (tmp+1)->d2 == 0.25 && (tmp+1)->d3 == 0.25 && (tmp+1)->d4 == 0.25 && (tmp+1)->d5 == 0.25 && (tmp+1)->d6 == 0.25 && (tmp+1)->d7 == 0.25, "thread-local memcheck failed!"); #else - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -1840,11 +1840,11 @@ for(outer=0; outer <= 1; outer++) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck 
failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1854,8 +1854,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -2076,7 +2076,7 @@ for(outer=0; outer <= 1; outer++) /*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/ for(j = 0; j < main_work_units; ++j) { - ASSERT(HERE, 0x0 == cy960_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy960_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -2086,7 +2086,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ @@ -2270,7 +2270,7 @@ void radix960_dif_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n/RADIX; @@ -2734,7 +2734,7 @@ void radix960_dit_pass1(double a[], int n) if(first_entry) { - ASSERT(HERE, (double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); + ASSERT((double *)t == &(t[0x00].re), "Unexpected value for Tmp-array-start pointer!"); first_entry=FALSE; NDIVR = n/RADIX; @@ -3410,8 
+3410,8 @@ void radix960_dit_pass1(double a[], int n) double *wt1 = thread_arg->wt1; double *wts_mult = thread_arg->wts_mult; // Const Intra-block wts-multiplier... double *inv_mult = thread_arg->inv_mult; // ...and 2*(its multiplicative inverse). - ASSERT(HERE,fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); - ASSERT(HERE,fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[0]*inv_mult[0] - 1.0) < EPS, "wts_mults fail accuracy check!"); + ASSERT(fabs(wts_mult[1]*inv_mult[1] - 1.0) < EPS, "wts_mults fail accuracy check!"); int *si = thread_arg->si; struct complex *rn0 = thread_arg->rn0; struct complex *rn1 = thread_arg->rn1; @@ -3629,10 +3629,10 @@ void radix960_dit_pass1(double a[], int n) half_arr= tmp + 0x02; #endif - ASSERT(HERE, (r00 == thread_arg->r00), "thread-local memcheck failed!"); - ASSERT(HERE, (half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); + ASSERT((r00 == thread_arg->r00), "thread-local memcheck failed!"); + ASSERT((half_arr == thread_arg->half_arr), "thread-local memcheck failed!"); #ifndef USE_AVX512 // In AVX-512 mode, use VRNDSCALEPD for rounding and hijack this vector-data slot for the 4 base/baseinv-consts: - ASSERT(HERE, (sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); + ASSERT((sse2_rnd->d0 == crnd && sse2_rnd->d1 == crnd), "thread-local memcheck failed!"); #endif tmp = half_arr; if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) @@ -3641,15 +3641,15 @@ void radix960_dit_pass1(double a[], int n) /* No-Op */ #elif defined(USE_AVX) // Grab some elt of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck 
failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #else // SSE2: - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif } else { - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); } VEC_DBL_INIT(max_err, 0.0); diff --git a/src/radix992_ditN_cy_dif1.c b/src/radix992_ditN_cy_dif1.c index c826e6e6..ca0efd9b 100755 --- a/src/radix992_ditN_cy_dif1.c +++ b/src/radix992_ditN_cy_dif1.c @@ -387,7 +387,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] if(CY_THREADS > 1) { main_work_units = CY_THREADS/2; pool_work_units = CY_THREADS - main_work_units; - ASSERT(HERE, 0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(pool_work_units, MAX_THREADS, pool_work_units, &thread_control)), "threadpool_init failed!"); printf("radix%d_ditN_cy_dif1: Init threadpool of %d threads\n", RADIX, pool_work_units); } else { main_work_units = 1; @@ -397,7 +397,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] #else pool_work_units = CY_THREADS; - ASSERT(HERE, 0x0 != (tpool = 
threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); + ASSERT(0x0 != (tpool = threadpool_init(CY_THREADS, MAX_THREADS, CY_THREADS, &thread_control)), "threadpool_init failed!"); #endif @@ -438,24 +438,24 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; } } - ASSERT(HERE, l < 4, "Failed to align cy_dat array!"); + ASSERT(l < 4, "Failed to align cy_dat array!"); } #endif #ifdef USE_SSE2 - ASSERT(HERE, ((uint32)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); - ASSERT(HERE, ((uint32)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); + ASSERT(((uint32)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!"); + ASSERT(((uint32)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!"); // Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage // consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread // (Add as many padding elts to the latter as needed to make it a multiple of 4): cslots_in_local_store = radix992_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3); - sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); sm_ptr = (uint64*)(sc_ptr + radix992_creals_in_local_store); - ASSERT(HERE, ((uint32)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); + ASSERT(((uint32)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!"); #ifdef USE_PTHREAD __r0 = sc_ptr; @@ -518,8 +518,8 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // 
This is where the value of half_arr_offset comes from half_arr= tmp + 0x02; /* This table needs 20 x 16 bytes for Mersenne-mod, and [4*ODD_RADIX] x 16 for Fermat-mod */ #endif - ASSERT(HERE, half_arr_offset992 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); - ASSERT(HERE, (radix992_creals_in_local_store << L2_SZ_VD) >= ((long)half_arr - (long)r00) + (20 << L2_SZ_VD), "radix992_creals_in_local_store checksum failed!"); + ASSERT(half_arr_offset992 == (uint32)(half_arr-sc_ptr), "half_arr_offset mismatches actual!"); + ASSERT((radix992_creals_in_local_store << L2_SZ_VD) >= ((long)half_arr - (long)r00) + (20 << L2_SZ_VD), "radix992_creals_in_local_store checksum failed!"); /* These remain fixed: */ VEC_DBL_INIT(isrt2,ISRT2); @@ -600,7 +600,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] // Up-multiply the complex exponential: qn = qfmul(qx, qc); qt = qfmul(qy, qs); qmul = qfsub(qn, qt); // Store qxnew in qmul for now. qn = qfmul(qx, qs); qt = qfmul(qy, qc); qy = qfadd(qn, qt); qx = qmul; - printf("j = %3u: cos = 0x%16llX\n",j,qfdbl_as_uint64(qx)); + printf("j = %3u: cos = %#16" PRIX64 "\n",j,qfdbl_as_uint64(qx)); } exit(0); #endif @@ -1109,12 +1109,12 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] _cy_i[i] = (double *)malloc(j); ptr_prod += (uint32)(_cy_i[i]== 0x0); } - ASSERT(HERE, ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); + ASSERT(ptr_prod == 0, "ERROR: unable to allocate one or more auxiliary arrays!"); /* Create (THREADS + 1) copies of _bjmodnini and use the extra (uppermost) one to store the "master" increment, i.e. 
the one that n2/radix-separated FFT outputs need: */ - _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + _bjmodnini = (int *)malloc((CY_THREADS + 1)*sizeof(int)); if(!_bjmodnini){ sprintf(cbuf,"ERROR: unable to allocate array _bjmodnini in %s.\n", func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } _bjmodnini[0] = 0; _bjmodnini[1] = 0; @@ -1149,7 +1149,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] { bjmodnini -= sw; bjmodnini = bjmodnini + ( (-(int)((uint32)bjmodnini >> 31)) & n); } - ASSERT(HERE, _bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); + ASSERT(_bjmodnini[CY_THREADS] == bjmodnini,"_bjmodnini[CY_THREADS] != bjmodnini"); // In non-power-of-2-runlength case, both Mersenne and Fermat-mod share these next 2 loops: if(CY_THREADS > 1) @@ -1200,7 +1200,7 @@ int radix992_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[] break; }; } - ASSERT(HERE, wts_idx_incr != 0, "wts_idx_incr init failed!"); + ASSERT(wts_idx_incr != 0, "wts_idx_incr init failed!"); #ifdef USE_SSE2 wts_idx_inc2 = wts_idx_incr << (2*L2_SZ_VD - 3); /* In the SIMD version, use icycle0-6 as actual address @@ -1585,8 +1585,8 @@ for(outer=0; outer <= 1; outer++) { tdat[ithread].iter = iter; // int data: - ASSERT(HERE, tdat[ithread].tid == ithread, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].tid == ithread, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].ndivr == NDIVR, "thread-local memcheck fail!"); tdat[ithread].khi = khi; tdat[ithread].i = _i[ithread]; /* Pointer to the BASE and BASEINV arrays. 
*/ @@ -1596,8 +1596,8 @@ for(outer=0; outer <= 1; outer++) tdat[ithread].col = _col[ithread]; tdat[ithread].co2 = _co2[ithread]; tdat[ithread].co3 = _co3[ithread]; - ASSERT(HERE, tdat[ithread].sw == sw, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].sw == sw, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].nwt == nwt, "thread-local memcheck fail!"); // double data: tdat[ithread].maxerr = 0.0; @@ -1606,26 +1606,26 @@ for(outer=0; outer <= 1; outer++) // pointer data: tdat[ithread].arrdat = a; /* Main data array */ - ASSERT(HERE, tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].si == si, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt0 == wt0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wt1 == wt1, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].si == si, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn0 == rn0, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].rn1 == rn1, "thread-local memcheck fail!"); #ifdef USE_SSE2 - ASSERT(HERE, tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); - ASSERT(HERE, tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].wts_idx_inc2 == wts_idx_inc2, "thread-local memcheck fail!"); + ASSERT(tdat[ithread].r00 == __r0 + ithread*cslots_in_local_store, "thread-local memcheck fail!"); tmp = tdat[ithread].half_arr; - ASSERT(HERE, ((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); + ASSERT(((tmp-1)->d0 == crnd && (tmp-1)->d1 == crnd), "thread-local memcheck failed!"); #endif if(MODULUS_TYPE == MODULUS_TYPE_MERSENNE) { #ifdef USE_AVX // Grab some elt 
of base-data [offset by, say, +32] and mpy by its inverse [+16 further] - dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d0 * (tmp+56)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+40)->d1 * (tmp+56)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #elif defined(USE_SSE2) - dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(HERE, fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d0 * (tmp+14)->d0; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp+10)->d1 * (tmp+14)->d1; ASSERT(fabs(dtmp - 1.0) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1635,8 +1635,8 @@ for(outer=0; outer <= 1; outer++) else /* Fermat-mod uses "double helix" carry scheme - 2 separate sets of real/imaginary carries for right-angle transform, plus "twisted" wraparound step. 
*/ { #ifdef USE_SSE2 - dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); - dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(HERE, fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d0 * (tmp+ODD_RADIX)->d0; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); + dtmp = (tmp)->d1 * (tmp+ODD_RADIX)->d1; ASSERT(fabs(dtmp - scale) < EPS, "thread-local memcheck failed!"); #endif /* init carries */ for(i = 0; i < RADIX; i++) { @@ -1795,7 +1795,7 @@ for(outer=0; outer <= 1; outer++) /*** Main execution thread executes remaining chunks in serial fashion (but in || with the pool threads): ***/ for(j = 0; j < main_work_units; ++j) { - ASSERT(HERE, 0x0 == cy992_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); + ASSERT(0x0 == cy992_process_chunk( (void*)(&tdat[j + pool_work_units]) ), "Main-thread task failure!"); } #endif @@ -1805,7 +1805,7 @@ for(outer=0; outer <= 1; outer++) ns_time.tv_nsec = 100000; // (long)nanoseconds - Get our desired 0.1 mSec as 10^5 nSec here while(tpool && tpool->free_tasks_queue.num_tasks != pool_work_units) { - ASSERT(HERE, 0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); + ASSERT(0 == mlucas_nanosleep(&ns_time), "nanosleep fail!"); } /* Copy the thread-specific output carry data back to shared memory: */ diff --git a/src/rng_isaac.c b/src/rng_isaac.c index 6110639f..1a96da9b 100755 --- a/src/rng_isaac.c +++ b/src/rng_isaac.c @@ -162,8 +162,8 @@ double rng_isaac_rand_double_norm_pos() /* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */ if(retval < 0.0 || retval > 1.0) { - sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16llx, iran64 = %16llx, retval = %lf not in [0,1]!\n", itmp64, iran64, retval); - ASSERT(HERE, 0, cbuf); + sprintf(cbuf, "rng_isaac_rand_double_norm_pos: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", 
retval = %lf not in [0,1]!\n", itmp64, iran64, retval); + ASSERT(0, cbuf); } return retval; } @@ -193,8 +193,8 @@ double rng_isaac_rand_double_norm_pm1() /* GCC compiler bug: needed to insert the explicit range-check here, otherwise compiler 'optimized' the (*(double *)&iran64) to zero: */ if(retval < -1.0 || retval > 1.0) { - sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16llx, iran64 = %16llx, retval = %lf not in [0,1]!\n", itmp64, iran64, retval); - ASSERT(HERE, 0, cbuf); + sprintf(cbuf, "rng_isaac_rand_double_norm_pm1: itmp64 = %16" PRIx64 ", iran64 = %16" PRIx64 ", retval = %lf not in [0,1]!\n", itmp64, iran64, retval); + ASSERT(0, cbuf); } return retval; } diff --git a/src/test_fft_radix.c b/src/test_fft_radix.c index 3036263e..fa3562e4 100755 --- a/src/test_fft_radix.c +++ b/src/test_fft_radix.c @@ -324,43 +324,43 @@ void test_fft_radix(void) index = ALLOC_INT(index , RADIX); dit_scramble = ALLOC_INT(dit_scramble, RADIX); /* double a[rmul*RADIX], b[rmul*RADIX], arrtmp[rmul*RADIX]: */ - ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX); ASSERT(HERE, (ptmp != 0x0), "FATAL: unable to allocate array A in test_fft_radix.\n"); + ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX); ASSERT((ptmp != 0x0), "FATAL: unable to allocate array A in test_fft_radix.\n"); a = ALIGN_DOUBLE(ptmp); ptmp = 0x0; ac = (struct complex *)a; - ASSERT(HERE, ((long)((void *)a) & 63) == 0x0,"test_fft_radix: A[] not aligned on 64-byte boundary!"); - ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX); ASSERT(HERE, (ptmp != 0x0), "FATAL: unable to allocate array B in test_fft_radix.\n"); + ASSERT(((long)((void *)a) & 63) == 0x0,"test_fft_radix: A[] not aligned on 64-byte boundary!"); + ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX); ASSERT((ptmp != 0x0), "FATAL: unable to allocate array B in test_fft_radix.\n"); b = ALIGN_DOUBLE(ptmp); ptmp = 0x0; - ASSERT(HERE, ((long)((void *)b) & 63) == 0x0,"test_fft_radix: B[] not aligned on 64-byte boundary!"); + ASSERT(((long)((void *)b) & 63) == 0x0,"test_fft_radix: B[] not aligned on 
64-byte boundary!"); bc = (struct complex *)b; - ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX); ASSERT(HERE, (ptmp != 0x0), "FATAL: unable to allocate array A_ptmp in test_fft_radix.\n"); + ptmp = ALLOC_DOUBLE(ptmp, rmul*RADIX); ASSERT((ptmp != 0x0), "FATAL: unable to allocate array A_ptmp in test_fft_radix.\n"); arrtmp = ALIGN_DOUBLE(ptmp); ptmp = 0x0; - ASSERT(HERE, ((long)((void *)arrtmp) & 63) == 0x0,"test_fft_radix: arrtmp[] not aligned on 64-byte boundary!"); + ASSERT(((long)((void *)arrtmp) & 63) == 0x0,"test_fft_radix: arrtmp[] not aligned on 64-byte boundary!"); /* struct complex mat[radix][RADIX], *matp[RADIX]: */ - ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX); ASSERT(HERE, (ctmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n"); + ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX); ASSERT((ctmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n"); matp = ALIGN_POINTER(ctmpp,struct complex*); - ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX); ASSERT(HERE, (ctmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n"); + ctmpp = ALLOC_POINTER(ctmpp,struct complex*, RADIX); ASSERT((ctmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n"); mat = ALIGN_POINTER(ctmpp,struct complex*); for(i = 0; i < RADIX; ++i) { - ctmp = ALLOC_COMPLEX(ctmp, RADIX); ASSERT(HERE, (ctmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n"); + ctmp = ALLOC_COMPLEX(ctmp, RADIX); ASSERT((ctmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n"); mat[i] = ALIGN_COMPLEX(ctmp); ctmp = 0x0; /* Must re-init pointer so the realloc used by the ALLOC macro allocates new fresh memory for each row */ } #ifdef USE_FGT61 - iptr = ALLOC_UINT64(iptr, rmul*RADIX); ASSERT(HERE, (iptr != 0x0), "FATAL: unable to allocate array AMOD in test_fft_radix.\n"); + iptr = ALLOC_UINT64(iptr, rmul*RADIX); ASSERT((iptr != 0x0), "FATAL: unable to allocate array AMOD in test_fft_radix.\n"); amod = 
ALIGN_UINT64(iptr); iptr = 0x0; am = (uint128 *)amod; - ASSERT(HERE, ((long)((void *)amod) & 63) == 0x0,"test_fft_radix: AMOD[] not aligned on 64-byte boundary!"); - iptr = ALLOC_UINT64(iptr, rmul*RADIX); ASSERT(HERE, (iptr != 0x0), "FATAL: unable to allocate array BMOD in test_fft_radix.\n"); + ASSERT(((long)((void *)amod) & 63) == 0x0,"test_fft_radix: AMOD[] not aligned on 64-byte boundary!"); + iptr = ALLOC_UINT64(iptr, rmul*RADIX); ASSERT((iptr != 0x0), "FATAL: unable to allocate array BMOD in test_fft_radix.\n"); bmod = ALIGN_UINT64(iptr); iptr = 0x0; - ASSERT(HERE, ((long)((void *)bmod) & 63) == 0x0,"test_fft_radix: BMOD[] not aligned on 64-byte boundary!"); + ASSERT(((long)((void *)bmod) & 63) == 0x0,"test_fft_radix: BMOD[] not aligned on 64-byte boundary!"); bm = (uint128 *)bmod; - iptr = ALLOC_UINT64(iptr, rmul*RADIX); ASSERT(HERE, (iptr != 0x0), "FATAL: unable to allocate array A_iptr in test_fft_radix.\n"); - itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX); ASSERT(HERE, (itmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n"); + iptr = ALLOC_UINT64(iptr, rmul*RADIX); ASSERT((iptr != 0x0), "FATAL: unable to allocate array A_iptr in test_fft_radix.\n"); + itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX); ASSERT((itmpp != 0x0), "FATAL: unable to allocate array MATP in test_fft_radix.\n"); matmodp = ALIGN_POINTER(itmpp,uint128*); - itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX); ASSERT(HERE, (itmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n"); + itmpp = ALLOC_POINTER(itmpp,uint128*, RADIX); ASSERT((itmpp != 0x0), "FATAL: unable to allocate array MAT[][] in test_fft_radix.\n"); matmod = ALIGN_POINTER(itmpp,uint128*); for(i = 0; i < RADIX; ++i) { - itmp = ALLOC_UINT128(itmp, RADIX); ASSERT(HERE, (itmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n"); + itmp = ALLOC_UINT128(itmp, RADIX); ASSERT((itmp != 0x0), "FATAL: unable to allocate array Ctmp in test_fft_radix.\n"); matmod[i] = ALIGN_UINT128(itmp); 
itmp = 0x0; /* Must re-init pointer so the realloc used by the ALLOC macro allocates new fresh memory for each row */ } @@ -371,8 +371,8 @@ void test_fft_radix(void) /* Power-of-2 component of the DFT length: */ pow2 = 1 << trailz32(RADIX); podd = RADIX >> trailz32(RADIX); - ASSERT(HERE, RADIX == pow2*podd, "Radix decomposition failed!"); - ASSERT(HERE, (podd < 16 || podd == 31 || podd == 63), "test_fft_radix: Illegal radix; must be odd*2^n with odd = [3,5,7,9,11,13,15,31,63]"); + ASSERT(RADIX == pow2*podd, "Radix decomposition failed!"); + ASSERT((podd < 16 || podd == 31 || podd == 63), "test_fft_radix: Illegal radix; must be odd*2^n with odd = [3,5,7,9,11,13,15,31,63]"); /* These may not have been init'ed yet, so do it here: */ DAT_BITS = DAT_BITS_DEF; PAD_BITS = PAD_BITS_DEF; @@ -410,14 +410,14 @@ void test_fft_radix(void) #ifdef USE_FGT61 order = RADIX; prim_root_q(order, &root_re,&root_im); // RADIXth primitive root of unity // primitive 16th root of unity, scaled by *8: - ASSERT(HERE, root_re == 1693317751237720973ull && root_im == 2283815672160731785ull,"Bad prim-root[16]!");; + ASSERT(root_re == 1693317751237720973ull && root_im == 2283815672160731785ull,"Bad prim-root[16]!");; #endif for(i = 0; i < RADIX; i++) { theta = i*twopi/RADIX; #ifdef USE_FGT61 pow_modq((uint64)i, root_re,root_im, &m0,&m1); // m0,m1 = Ith power of prim-root - if(i == 0) ASSERT(HERE, m0 == 1ull && m1 == 0ull, "Bad 0th power of prim-root!"); + if(i == 0) ASSERT(m0 == 1ull && m1 == 0ull, "Bad 0th power of prim-root!"); rm = 1ull; im = 0ull; // leftmost col has [m0,m1]^0 = [1,0]... // printf("DFT-int matrix row %d:\n",i); #endif @@ -429,7 +429,7 @@ void test_fft_radix(void) #ifdef USE_FGT61 matmod[i][j].d0 = rm; matmod[i][j].d1 = im; - // printf("\t[%2d] = %20llu, %20llu\n",j, rm,im); + // printf("\t[%2d] = %20" PRIu64 ", %20" PRIu64 "\n",j, rm,im); cmul_modq(m0,m1, rm,im, &rm,&im); // ... 
[j]col has [m0,m1]^j rm = qreduce_full(rm); im = qreduce_full(im); #endif @@ -443,8 +443,8 @@ void test_fft_radix(void) #ifdef USE_SSE2 /* In SSE2 mode re-Init data array, using [re,re,im,im] data layout: */ i = RADIX-1; i = i + ( (i >> DAT_BITS) << PAD_BITS ); /* padded-array fetch index is here */ - ASSERT(HERE, i == RADIX-1, "for large radix, need to enable array padding in indexing here!!"); - ASSERT(HERE, rmul == 4, "!"); + ASSERT(i == RADIX-1, "for large radix, need to enable array padding in indexing here!!"); + ASSERT(rmul == 4, "!"); for(i = 0; i < RADIX ; i++) { a[ 2*i *RE_IM_STRIDE ] = ref[2*i ]; @@ -719,7 +719,7 @@ void test_fft_radix(void) l = (l - podd)%RADIX; if(l < 0) { l += RADIX; } } } else { // Odd-prime or odd-prime-power radix - ASSERT(HERE, (nradices == 1 && (radix_prim[0]&1)) + ASSERT((nradices == 1 && (radix_prim[0]&1)) || (nradices > 1 && (radix_prim[0] == radix_prim[1])), "Unexpected radix-decomposition!"); } @@ -957,7 +957,7 @@ void test_fft_radix(void) { j = index[i]; #ifdef USE_SSE2 - ASSERT(HERE, a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!"); + ASSERT(a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!"); #endif j1 = 2*i *RE_IM_STRIDE; // Real part j2 = (2*i+1)*RE_IM_STRIDE; // Imag part @@ -971,10 +971,10 @@ void test_fft_radix(void) // We only deploy FGT-based DFTs once the floating version has been tested, so no point doing the sorting // here, just compare using the (presumably correct) output-index permutations derived for the float code: #ifdef USE_FGT61 - printf("I = %3u: DIF-ref: %20llu %20llu, FGT: %20llu %20llu",i, bmod[2*j],bmod[2*j+1], amod[j1],amod[j2]); + printf("I = %3u: DIF-ref: %20" PRIu64 " %20" PRIu64 ", FGT: %20" PRIu64 " %20" PRIu64,i, bmod[2*j],bmod[2*j+1], amod[j1],amod[j2]); if(bmod[2*j] != amod[j1] || bmod[2*j+1] != amod[j2]) { 
if(bmod[2*j] != qreduce_full(amod[j1]) || bmod[2*j+1] != qreduce_full(amod[j2])) { - printf("\tDiff = %20lld %20lld\n",bmod[2*j]-amod[j1], bmod[2*j+1]-amod[j2]); + printf("\tDiff = %20" PRId64 " %20" PRId64 "\n",bmod[2*j]-amod[j1], bmod[2*j+1]-amod[j2]); } else { printf("\tMatch (mod q)\n"); } @@ -1058,7 +1058,7 @@ void test_fft_radix(void) avgerr *= iradix; printf("test_fft_radix: %d Mismatches detected in DIF DFT; maxerr = %15.10e, avgerr = %15.10e\n", nerr, maxerr, avgerr); printf("\n"); - ASSERT(HERE, nerr == 0, "test_fft_radix: Mismatches detected in DIF transform!"); + ASSERT(nerr == 0, "test_fft_radix: Mismatches detected in DIF transform!"); #endif @@ -1082,12 +1082,12 @@ void test_fft_radix(void) /*printf("J = [%3d]: add %6d, %6d\n",j,(int)a[2*j ],(int)a[2*j+1]);*/ } /*printf("sum[Re,Im] = %15.5f %15.5f\n",t0,t2);*/ - ASSERT(HERE, t0==t1 && t2==t3, "!"); + ASSERT(t0==t1 && t2==t3, "!"); for(i = 0; i < rmul*RADIX ; i+=2) { a[i ] = arrtmp[i ]; a[i+1] = arrtmp[i+1]; - ASSERT(HERE, a[i ] == a[i+1], "!"); + ASSERT(a[i ] == a[i+1], "!"); } #else @@ -1113,7 +1113,7 @@ void test_fft_radix(void) // Since we negated Im-part above, must analogize to q - (pre-negation)a[j2] here: amod[j1] = a[j1]; // Here, the cast-to-uint64 is implied by the assignment ... amod[j2] = q + (uint64)a[j2]; // ...but here need explicit cast to ensure integer addition. 
- printf("DIT-in[%2u]: float = [%10.5f,%10.5f]; int = [ %llu, q - %llu]\n",i, a[j1],a[j2] ,amod[j1],q - amod[j2]); + printf("DIT-in[%2u]: float = [%10.5f,%10.5f]; int = [ %" PRIu64 ", q - %" PRIu64 "]\n",i, a[j1],a[j2] ,amod[j1],q - amod[j2]); #endif } @@ -1290,10 +1290,10 @@ void test_fft_radix(void) // here, just compare using the (presumably correct) output-index permutations derived for the float code: #ifdef USE_FGT61 // Flip sign on Im-part of ref-outputs: - printf("I = %3u: DIT-ref: %20llu %20llu, FGT: %20llu %20llu",i, bmod[2*i],q-bmod[2*i+1], amod[j1],amod[j2]); + printf("I = %3u: DIT-ref: %20" PRIu64 " %20" PRIu64 ", FGT: %20" PRIu64 " %20" PRIu64,i, bmod[2*i],q-bmod[2*i+1], amod[j1],amod[j2]); if(bmod[2*i] != amod[j1] || q-bmod[2*i+1] != amod[j2]) { if(bmod[2*i] != qreduce_full(amod[j1]) || q-bmod[2*i+1] != qreduce_full(amod[j2])) { - printf("\tDiff = %20lld %20lld\n",bmod[2*i]-amod[j1], (q-bmod[2*i+1])-amod[j2]); + printf("\tDiff = %20" PRId64 " %20" PRId64 "\n",bmod[2*i]-amod[j1], (q-bmod[2*i+1])-amod[j2]); } else { printf("\tMatch (mod q)\n"); } @@ -1369,7 +1369,7 @@ void test_fft_radix(void) avgerr *= iradix; printf("test_fft_radix: %d Mismatches detected in DIT DFT; maxerr = %15.10e, avgerr = %15.10e\n", nerr, maxerr, avgerr); printf("\n"); - ASSERT(HERE, nerr == 0, "test_fft_radix: Mismatches detected in DIT transform!"); + ASSERT(nerr == 0, "test_fft_radix: Mismatches detected in DIT transform!"); #endif @@ -1380,7 +1380,7 @@ void test_fft_radix(void) for(i = 0; i < RADIX ; i++) { #ifdef USE_SSE2 - ASSERT(HERE, a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!"); + ASSERT(a[2*i*RE_IM_STRIDE] == a[2*i*RE_IM_STRIDE+1] && a[(2*i+1)*RE_IM_STRIDE] == a[(2*i+1)*RE_IM_STRIDE+1], "1/2 components of SSE2-pack mismatch!"); #endif j1 = 2*i *RE_IM_STRIDE; j2 = (2*i+1)*RE_IM_STRIDE; @@ -1396,10 +1396,10 @@ void test_fft_radix(void) printf("%4d %25.15f %25.15f, ERR= 
%15.10e\n",i,a[j1], a[j2], CABS(err_r, err_i)); } #ifdef USE_FGT61 - printf("I = %3u: DIF+DIT ref: [%lld,%lld], FGT: [%20llu,%20llu]",i, (uint64)arrtmp[2*i],(uint64)arrtmp[2*i+1], amod[j1]/RADIX,amod[j2]/RADIX); + printf("I = %3u: DIF+DIT ref: [%" PRId64 ",%" PRId64 "], FGT: [%20" PRIu64 ",%20" PRIu64 "]",i, (uint64)arrtmp[2*i],(uint64)arrtmp[2*i+1], amod[j1]/RADIX,amod[j2]/RADIX); if((uint64)arrtmp[2*i] != amod[j1]/RADIX || (uint64)arrtmp[2*i+1] != amod[j2]/RADIX) { if((uint64)arrtmp[2*i] != qreduce_full(amod[j1])/RADIX || (uint64)arrtmp[2*i+1] != qreduce_full(amod[j2])/RADIX) { - printf("\tMismatch! mod-outputs (mod RADIX) = [%20llu,%20llu]\n",amod[j1]%RADIX, amod[j2]%RADIX); + printf("\tMismatch! mod-outputs (mod RADIX) = [%20" PRIu64 ",%20" PRIu64 "]\n",amod[j1]%RADIX, amod[j2]%RADIX); } else { printf("\tMatch (mod q)\n"); } @@ -1411,7 +1411,7 @@ void test_fft_radix(void) avgerr *= iradix; printf("test_fft_radix: %d Mismatches detected in DIF/DIT combo; maxerr = %15.10e, avgerr = %15.10e\n", nerr, maxerr, avgerr); printf("\n"); - ASSERT(HERE, nerr == 0, "test_fft_radix: Mismatches detected in DIF/DIT combo!"); + ASSERT(nerr == 0, "test_fft_radix: Mismatches detected in DIF/DIT combo!"); #endif printf(""); @@ -1481,7 +1481,7 @@ void matmul_fgtmod(uint128 **mat, uint128 vec_in[], uint128 vec_out[], int nrow, cmul_modq(mat[i][j].d0,mat[i][j].d1, vec_in[j].d0,vec_in[j].d1, &rm,&im); // CMUL_MODQ outputs in 0,4b - must feed to qreduce() prior to accumulating: rm = qreduce(rm); im = qreduce(im); - // if(!i) printf("\t[%2d] = [%llu,%llu] * [%llu,%llu] = [%llu,%llu]\n",j, mat[i][j].d0,mat[i][j].d1, vec_in[j].d0,vec_in[j].d1, rm,im); + // if(!i) printf("\t[%2d] = [%" PRIu64 ",%" PRIu64 "] * [%" PRIu64 ",%" PRIu64 "] = [%" PRIu64 ",%" PRIu64 "]\n",j, mat[i][j].d0,mat[i][j].d1, vec_in[j].d0,vec_in[j].d1, rm,im); rm += vec_out[i].d0; im += vec_out[i].d1; // Normalize to ensure accumulated sum in [0,q-1]: diff --git a/src/threadpool.c b/src/threadpool.c index 
94fef552..4dde1b9e 100644 --- a/src/threadpool.c +++ b/src/threadpool.c @@ -266,7 +266,7 @@ me at: heber.tomer@gmail.com #endif static void *worker_thr_routine(void *data) { - char cbuf[STR_MAX_LEN]; + char cbuf[STR_MAX_LEN*2]; #if INCLUDE_HWLOC char str[80]; #endif @@ -293,7 +293,7 @@ me at: heber.tomer@gmail.com i = mi64_ith_set_bit(CORE_SET, i+1, MAX_CORES>>6); // Remember, [i]th-bit index in arglist is *unit* offset, i.e. must be in [1,MAX_CORES] if(i < 0) { fprintf(stderr,"Affinity CORE_SET does not have a [%u]th set bit!",my_id % pool->num_of_cores); - ASSERT(HERE, 0, "Aborting."); + ASSERT(0, "Aborting."); } CPU_SET(i, &cpu_set); errcode = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof (cpu_set), &cpu_set); @@ -338,7 +338,7 @@ me at: heber.tomer@gmail.com i = mi64_ith_set_bit(CORE_SET, i+1, MAX_CORES>>6); // Remember, [i]th-bit index in arglist is *unit* offset, i.e. must be in [1,MAX_CORES] if(i < 0) { fprintf(stderr,"Affinity CORE_SET does not have a [%u]th set bit!",my_id % pool->num_of_cores); - ASSERT(HERE, 0, "Aborting."); + ASSERT(0, "Aborting."); } #if INCLUDE_HWLOC @@ -349,14 +349,14 @@ me at: heber.tomer@gmail.com if (obj) { hwloc_bitmap_or(cpuset, cpuset, obj->cpuset); } else { - snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: HWLOC_OBJ_PU[%u] not found.\n",i); + snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: HWLOC_OBJ_PU[%u] not found.\n",i); fprintf(stderr,"%s",cbuf); } // Set affinity to specified logical CPUs: if (hwloc_set_cpubind(hw_topology, cpuset, HWLOC_CPUBIND_THREAD)) { int error = errno; hwloc_bitmap_snprintf (str, sizeof (str), cpuset); - snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Warning: Unable to set affinity to cpuset %s: %s; leaving up to OS to manage thread/core binding.\n",str,strerror(error)); + snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Warning: Unable to set affinity to cpuset %s: %s; leaving up to OS to manage thread/core binding.\n",str,strerror(error)); fprintf(stderr,"%s",cbuf); #if THREAD_POOL_DEBUG } else { @@ 
-401,7 +401,7 @@ me at: heber.tomer@gmail.com i = mi64_ith_set_bit(CORE_SET, i+1, MAX_CORES>>6); // Remember, [i]th-bit index in arglist is *unit* offset, i.e. must be in [1,MAX_CORES] if(i < 0) { fprintf(stderr,"Affinity CORE_SET does not have a [%u]th set bit!",my_id % pool->num_of_cores); - ASSERT(HERE, 0, "Aborting."); + ASSERT(0, "Aborting."); } apolicy.affinity_tag = i; // set affinity tag #if THREAD_POOL_DEBUG diff --git a/src/twopmodq.c b/src/twopmodq.c index e4b3ae09..4c8f4349 100755 --- a/src/twopmodq.c +++ b/src/twopmodq.c @@ -46,7 +46,7 @@ uint64 test_modsqr64(uint64 x, uint64 q) uint64 qinv,t,hi,lo; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q & 0x1, "q must be odd!"); + ASSERT(q & 0x1, "q must be odd!"); qinv = (q+q+q) ^ (uint64)2; for(j = 0; j < 4; j++) { @@ -92,7 +92,7 @@ uint96 test_modsqr96(uint96 x, uint96 q) #endif /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q.d0 & 0x1, "q must be odd!"); + ASSERT(q.d0 & 0x1, "q must be odd!"); /* Init qinv = q. Since we're only interested in the bottom 3 bits of q, can use 64-bit math for that:*/ qinv.d0 = q.d0; qinv.d1 = (uint64)0; @@ -134,7 +134,7 @@ uint128 test_modsqr128(uint128 x, uint128 q) uint128 qinv,t,hi,lo; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q.d0 & 0x1, "q must be odd!"); + ASSERT(q.d0 & 0x1, "q must be odd!"); /* Init qinv = q. Since we're only interested in the bottom 3 bits of q, can use 64-bit math for that:*/ qinv.d0 = (q.d0+q.d0+q.d0) ^ (uint64)2; qinv.d1 = (uint64)0; @@ -178,7 +178,7 @@ uint128 test_modsqr128_96(uint128 x, uint128 q) uint128 qinv,t,lo; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q.d0 & 0x1, "q must be odd!"); + ASSERT(q.d0 & 0x1, "q must be odd!"); /* Init qinv = q. 
Since we're only interested in the bottom 3 bits of q, can use 64-bit math for that:*/ qinv.d0 = (q.d0+q.d0+q.d0) ^ (uint64)2; qinv.d1 = (uint64)0; @@ -233,7 +233,7 @@ uint32 test_twopmodq64(uint32 imax) #endif mi64_div(prod128, &q, 2,1, 0x0,&rem); // Omit quotient computation; remainder in rem if(rem != 1) { - fprintf(stderr,"Mismatch in test_twopmodq64: p = %llu; q = %llu: 2^[+|-p] (mod q) = %llu, %llu.\n",p,q,pos,neg); + fprintf(stderr,"Mismatch in test_twopmodq64: p = %" PRIu64 "; q = %" PRIu64 ": 2^[+|-p] (mod q) = %" PRIu64 ", %" PRIu64 ".\n",p,q,pos,neg); return 1; } } @@ -386,7 +386,7 @@ uint64 twopmodq63(uint64 p, uint64 q) */ /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q & 0x1, "q must be odd!"); + ASSERT(q & 0x1, "q must be odd!"); qinv = (q+q+q) ^ (uint64)2; for(j = 0; j < 4; j++) @@ -408,7 +408,7 @@ uint64 twopmodq63(uint64 p, uint64 q) { x = x + x - (q & -(x >= qhalf)); } - DBG_ASSERT(HERE, x < q, "twopmodq63 : x0 < q"); + DBG_ASSERT(x < q, "twopmodq63 : x0 < q"); #if FAC_DEBUG fprintf(stderr, "twopmodq63 : x0 = %s, q = %s\n", &str0[convert_uint64_base10_char(str0, x)], &str1[convert_uint64_base10_char(str1, q)] ); @@ -454,7 +454,7 @@ uint64 twopmodq63(uint64 p, uint64 q) uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) { //int dbg = ( (p == (1ull<<32)) && ( (k0 == 2958ull) || (k1 == 2958ull) || (k2 == 2958ull) || (k3 == 2958ull) ) ); -//if(dbg) printf("Hit! k0-3 = %llu, %llu, %llu, %llu\n",k0, k1, k2, k3); +//if(dbg) printf("Hit! 
k0-3 = %" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "\n",k0, k1, k2, k3); int32 j; uint32 q32_0, q32_1, q32_2, q32_3; uint32 qinv32_0, qinv32_1, qinv32_2, qinv32_3; @@ -483,7 +483,7 @@ uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) qhalf3 = q3>>1; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); // This gives 4-bit inverse: q32_0 = (uint32)q0; @@ -552,7 +552,7 @@ uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) x3 = x3 + x3 - (q3 & -(x3 >= qhalf3)); } -//if(dbg) printf("q1 = %llu: x1 = %llu\n",q1,x1); +//if(dbg) printf("q1 = %" PRIu64 ": x1 = %" PRIu64 "\n",q1,x1); for(j = start_index-2; j >= 0; j--) { /*...x^2 mod q is returned in x. On MIPS, we discard the lower half of DMULTU(q,x*y*qinv). */ @@ -618,14 +618,14 @@ uint64 twopmodq63_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) if(x2 >= q2) x2 -= q2; if(x3 >= q3) x3 -= q3; #endif -//if(dbg) printf("2*x^2 = %llu\n",x1); +//if(dbg) printf("2*x^2 = %" PRIu64 "\n",x1); } else { -//if(dbg) printf(" x^2 = %llu\n",x1); +//if(dbg) printf(" x^2 = %" PRIu64 "\n",x1); } } /*...Double and return. These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. 
*/ -//if(dbg) printf("xout = %llu\n",x1+x1-q1+FERMAT); +//if(dbg) printf("xout = %" PRIu64 "\n",x1+x1-q1+FERMAT); r = 0; if(x0+x0-q0+FERMAT == 1)r += 1; if(x1+x1-q1+FERMAT == 1)r += 2; @@ -659,7 +659,7 @@ uint64 twopmodq63_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6 pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); // This gives 4-bit inverse: q32_0 = (uint32)q0; @@ -927,7 +927,7 @@ uint64 twopmodq63_x8(uint64 q0, uint64 q1, uint64 q2, uint64 q3, uint64 q4, uint uint64 lead6, pshift6, qinv6, zshift6, x6, lo6, hi6; uint64 lead7, pshift7, qinv7, zshift7, x7, lo7, hi7; - DBG_ASSERT(HERE, (q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq63_x8: Inputs nonmonotone!"); + DBG_ASSERT((q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq63_x8: Inputs nonmonotone!"); pshift0 = q0 + 63; pshift1 = q1 + 63; @@ -977,7 +977,7 @@ uint64 twopmodq63_x8(uint64 q0, uint64 q1, uint64 q2, uint64 q3, uint64 q4, uint zshift7 = 63 - lead7; zshift7 <<= 1; pshift7 = ~pshift7; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); qinv0 = (q0+q0+q0) ^ (uint64)2; qinv1 = (q1+q1+q1) ^ (uint64)2; @@ -1195,7 +1195,7 @@ uint64 twopmmodq64(uint64 p, uint64 q) // debug = (q == 640126220763136ull); /* Uncomment (debug = ...) 
part and customize q to enable debug-printing */ uint32 curr_bit, leadb, start_index, nshift; uint64 pshift, qhalf, qinv, x, rsqr; - if(debug) printf("twopmmodq64: computing 2^%llu (mod %llu)\n",p,q); + if(debug) printf("twopmmodq64: computing 2^%" PRIu64 " (mod %" PRIu64 ")\n",p,q); // If p <= 64, directly compute 2^p (mod q): if(p < 64) return (1ull < p) % q; @@ -1209,7 +1209,7 @@ uint64 twopmmodq64(uint64 p, uint64 q) if(nshift) { // p >= nshift guaranteed here: q >>= nshift; p -= nshift; // Right-shift dividend by (nshift) bits; for 2^p this means subtracting nshift from p - if(debug) printf("Removed power-of-2 from q: q' = (q >> %u) = %llu\n",nshift,q); + if(debug) printf("Removed power-of-2 from q: q' = (q >> %u) = %" PRIu64 "\n",nshift,q); } qhalf = q>>1; /* = (q-1)/2, since q odd. */ // Extract leftmost 7 bits of (p - 64); if > 64, use leftmost 6 instead: @@ -1222,7 +1222,7 @@ uint64 twopmmodq64(uint64 p, uint64 q) start_index = 57-j; } /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, (q & 0x1) && (q > 1), "q must be odd > 1!"); + ASSERT((q & 0x1) && (q > 1), "q must be odd > 1!"); qinv = (q+q+q) ^ (uint64)2; for(j = 0; j < 4; j++) qinv = qinv*((uint64)2 - q*qinv); @@ -1237,16 +1237,16 @@ uint64 twopmmodq64(uint64 p, uint64 q) MONT_MUL64(x,rsqr, q,qinv, x); // x*R (mod q) = MONT_MUL(x,R^2 (mod q),q,qinv) } if(debug) { - printf("leadb = %u, x0 = %llu\n",leadb,x); - printf("pshift = p - %u = %llu\n",64,pshift); + printf("leadb = %u, x0 = %" PRIu64 "\n",leadb,x); + printf("pshift = p - %u = %" PRIu64 "\n",64,pshift); pow = leadb + 64; - printf("twopmmodq64: Initial power = 2^(%u+64) = 2^%u mod q' = %llu\n",leadb,pow,x); + printf("twopmmodq64: Initial power = 2^(%u+64) = 2^%u mod q' = %" PRIu64 "\n",leadb,pow,x); printf("twopmmodq64: Looping over %u remaining bits in power:\n",start_index); } for(j = start_index-1; j >= 0; j--) { curr_bit = (pshift >> j) & (uint64)1; MONT_SQR64(x,q,qinv,x); - if(debug) { pow = 2*pow + curr_bit - 64; 
printf("\tJ = %2u: [bit = %u]pow = %u, x = %llu\n",j,curr_bit,pow,x); } + if(debug) { pow = 2*pow + curr_bit - 64; printf("\tJ = %2u: [bit = %u]pow = %u, x = %" PRIu64 "\n",j,curr_bit,pow,x); } if(curr_bit) { if(x > qhalf) { /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ x = x + x; x -= q; @@ -1257,12 +1257,12 @@ uint64 twopmmodq64(uint64 p, uint64 q) } // May 2022: Since pre-subtracted 64 from computed powermod exponent, no need to un-scale the loop output anymore: // MONT_UNITY_MUL64(x,q,qinv,x); - if(debug) printf("pow = %u, x = %llu\n",pow,x); + if(debug) printf("pow = %u, x = %" PRIu64 "\n",pow,x); // If we applied an initial right-justify shift to the modulus, restore the shift to the // current (partial) remainder and re-add the off-shifted part of the true remainder. if(nshift) { x = (x << nshift);// + rem_save; - if(debug) printf("Restoring power-of-2: pow = %u, x *= 2^%u = %llu\n",pow+nshift,nshift,x); + if(debug) printf("Restoring power-of-2: pow = %u, x *= 2^%u = %" PRIu64 "\n",pow+nshift,nshift,x); } return x; } @@ -1337,8 +1337,8 @@ void twopmmodq64_q4(uint64 p, uint64 *i0, uint64 *i1, uint64 *i2, uint64 *i3, ui start_index = 58-j; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!"); - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); + ASSERT(q1 > 1 && q1 > 1 && q2 > 1 && q3 > 1 , "modulus must be > 1!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); qinv0 = (q0+q0+q0) ^ (uint64)2; qinv1 = (q1+q1+q1) ^ (uint64)2; @@ -1572,7 +1572,7 @@ uint64 twopmodq64(uint64 p, uint64 q) qhalf = q>>1; /* = (q-1)/2, since q odd. 
*/ /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q & 1, "q must be odd!"); + ASSERT(q & 1, "q must be odd!"); qinv = (q+q+q) ^ (uint64)2; for(j = 0; j < 4; j++) { @@ -1595,7 +1595,7 @@ uint64 twopmodq64(uint64 p, uint64 q) x = x + x - (q & -(x >= qhalf)); } #if FAC_DEBUG -/* ASSERT(HERE, x < q, "twopmodq64: x0 < q"); */ +/* ASSERT(x < q, "twopmodq64: x0 < q"); */ #if 0 /* These appear to be benign: */ if(x >= q){ sprintf(char_buf, "twopmodq64: (x0 = %s) >= (q = %s)", &str0[convert_uint64_base10_char(str0, x)], &str1[convert_uint64_base10_char(str1, q)] ); DBG_WARN(HERE, char_buf, STATFILE, !restart); } #endif @@ -1661,7 +1661,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); // This gives 4-bit inverse: q32_0 = (uint32)q0; @@ -1834,7 +1834,7 @@ uint64 twopmodq64_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6 pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); // This gives 4-bit inverse: q32_0 = (uint32)q0; @@ -2101,11 +2101,11 @@ uint64 twopmodq65(uint64 p, uint64 k) if(dbg)printf("twopmodq65:\n"); #endif // Assume q is 65-bits here, so check that during construction of q = 2.k.p+1: - q = k*p; ASSERT(HERE, q+q < q, "q not 65 bits!"); + q = k*p; ASSERT(q+q < q, "q not 65 bits!"); q = (q << 1) + 1; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q & 1, "q must be odd!"); + ASSERT(q & 1, "q must be odd!"); qinv = (q+q+q) ^ (uint64)2; for(j = 0; j < 4; j++) { @@ -2147,7 +2147,7 @@ if(dbg)printf("twopmodq65:\n"); ! 
Get SQR_HIGH_EXACT(x) - MULH_EXACT(q, lo*qinv), normalizing as we go. */ #if FAC_DEBUG - if(dbg) printf("twopmodq65: while(%llu++ < %llu) || (%llu+=%llu < %llu)\n",A,B,hi,q,y); + if(dbg) printf("twopmodq65: while(%" PRIu64 "++ < %" PRIu64 ") || (%" PRIu64 "+=%" PRIu64 " < %" PRIu64 ")\n",A,B,hi,q,y); #endif while(A < B || (A == B && hi < y)) /* SQR_HIGH_EXACT(x) < MULH_EXACT(q, lo*qinv); add q until >= . */ { @@ -2157,7 +2157,7 @@ if(dbg)printf("twopmodq65:\n"); /* Do the subtraction. Result is in (hi,A). */ #if FAC_DEBUG - /*ASSERT(HERE, A > B || (A == B && hi >= y), "twopmodq65 : A > B || (A == B && hi >= y)"); */ + /*ASSERT(A > B || (A == B && hi >= y), "twopmodq65 : A > B || (A == B && hi >= y)"); */ #endif A -= B; x = hi; hi -= y; A -= (hi > x); /* had a borrow */ @@ -2165,25 +2165,25 @@ if(dbg)printf("twopmodq65:\n"); /* ...and normalize. Result is in (x, A).. */ x = hi; #if FAC_DEBUG - if(dbg) printf("twopmodq65: while(A=%llu-- > 1) || (%llu-=%llu >=%llu)\n",A,x,q,q); + if(dbg) printf("twopmodq65: while(A=%" PRIu64 "-- > 1) || (%" PRIu64 "-=%" PRIu64 " >=%" PRIu64 ")\n",A,x,q,q); #endif while(A > 1 || (A == 1 && x >= q)) { --A; x -= q; #if FAC_DEBUG - if(dbg) printf("twopmodq65: A = %llu, x-q = %llu, q = %llu, hi = %llu\n",A,x,q,hi); - if(dbg) printf("twopmodq65: (x > hi) = %llu\n",(x > hi)); - if(dbg) printf("twopmodq65: (x <=hi) = %llu\n",(x <=hi)); - if(dbg) printf("twopmodq65: (hi < x) = %llu\n",(hi < x)); - if(dbg) printf("twopmodq65: (hi <=x) = %llu\n",(hi <=x)); - if(dbg) printf("twopmodq65: (hi >=x) = %llu\n",(hi >=x)); - if(dbg) printf("twopmodq65: (hi -x ) = %llu - %llu = %llu\n",hi,x,(hi - x)); + if(dbg) printf("twopmodq65: A = %" PRIu64 ", x-q = %" PRIu64 ", q = %" PRIu64 ", hi = %" PRIu64 "\n",A,x,q,hi); + if(dbg) printf("twopmodq65: (x > hi) = %" PRIu64 "\n",(x > hi)); + if(dbg) printf("twopmodq65: (x <=hi) = %" PRIu64 "\n",(x <=hi)); + if(dbg) printf("twopmodq65: (hi < x) = %" PRIu64 "\n",(hi < x)); + if(dbg) printf("twopmodq65: (hi <=x) = %" 
PRIu64 "\n",(hi <=x)); + if(dbg) printf("twopmodq65: (hi >=x) = %" PRIu64 "\n",(hi >=x)); + if(dbg) printf("twopmodq65: (hi -x ) = %" PRIu64 " - %" PRIu64 " = %" PRIu64 "\n",hi,x,(hi - x)); if(dbg) printf("twopmodq65: (hi.-x.) = %lf - %lf = %lf \n",(double)hi,(double)x,((double)hi - (double)x)); #endif /* had a borrow: */ A -= (x > hi); #if FAC_DEBUG - ASSERT(HERE, ((double)x > (double)hi) == (x > hi),"((double)x > (double)hi) == (x > hi)"); - ASSERT(HERE, (int64)A >=0,"(int64)A >=0"); + ASSERT(((double)x > (double)hi) == (x > hi),"((double)x > (double)hi) == (x > hi)"); + ASSERT((int64)A >=0,"(int64)A >=0"); #endif } @@ -2195,7 +2195,7 @@ if(dbg)printf("twopmodq65:\n"); /* ...and normalize the result. */ #if FAC_DEBUG - if(dbg) printf("twopmodq65: while(B=%llu-- > 1) || (%llu-=%llu >=%llu)\n",B,y,q,q); + if(dbg) printf("twopmodq65: while(B=%" PRIu64 "-- > 1) || (%" PRIu64 "-=%" PRIu64 " >=%" PRIu64 ")\n",B,y,q,q); #endif while(B > 1 || (B == 1 && y >= q)) { @@ -2214,14 +2214,14 @@ if(dbg)printf("twopmodq65:\n"); /* ...and normalize the result. 
*/ #if FAC_DEBUG - if(dbg) printf("twopmodq65:#while(%llu-- > 1) || (%llu-=%llu >=%llu)\n",B,y,q,q); + if(dbg) printf("twopmodq65:#while(%" PRIu64 "-- > 1) || (%" PRIu64 "-=%" PRIu64 " >=%" PRIu64 ")\n",B,y,q,q); #endif while(B > 1 || (B == 1 && y >= q)) { --B; t = y; y -= q; B -= (y > t); /* had a borrow */ } - /*if(y == (uint64)1) ASSERT(HERE, B == 0, "twopmodq65 : B == 0");*/ + /*if(y == (uint64)1) ASSERT(B == 0, "twopmodq65 : B == 0");*/ return (y + FERMAT == 1ull && B == 0ull); } @@ -2250,7 +2250,7 @@ uint64 twopmodq65_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); qinv0 = (q0+q0+q0) ^ (uint64)2; qinv1 = (q1+q1+q1) ^ (uint64)2; @@ -2401,7 +2401,7 @@ uint64 twopmodq65_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6 pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); qinv0 = (q0+q0+q0) ^ (uint64)2; qinv1 = (q1+q1+q1) ^ (uint64)2; diff --git a/src/twopmodq100.c b/src/twopmodq100.c index e9dc1734..cc2eb64f 100755 --- a/src/twopmodq100.c +++ b/src/twopmodq100.c @@ -54,7 +54,7 @@ #endif uint64 *fq0[32],*fq1[32], *fqinv0[32],*fqinv1[32], *fx0[32],*fx1[32]; for(j = 0; j < 32; j++) { - ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!"); + ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!"); } #else const double crnd = 3.0*0x4000000*0x2000000; // Const used to emulate DNINT(x) and (when multiplied by BASE) 2^50 * DNINT(x*2^bpow2) @@ -73,12 +73,12 @@ double *fq0[32],*fq1[32], *fqinv0[32],*fqinv1[32], *fx0[32],*fx1[32], kdbl[32]; // AVX-512 Foundation lacks the needed DQ extensions, so use HLL to 
convert kvec entries to double: for(j = 0; j < 32; j++) { - ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!"); + ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!"); kdbl[j] = (double)k[j]; } - ASSERT(HERE, base == (double)(1ull<= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; @@ -126,9 +126,9 @@ // Alloc the local-memory block the #bytes multiplier has plenty of extra room built in, e.g. for debug-data-writes: #ifdef USE_AVX512_I - sc_arr = ALLOC_UINT64(sc_arr, 0x140*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_UINT64(sc_arr, 0x140*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (uint64 *)ALIGN_VEC_U64(sc_arr); // Force vec_u64-alignment - ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; #else @@ -157,9 +157,9 @@ #else // Default AVX-512 floating-point-FMA mode /***************************************************/ - sc_arr = ALLOC_DOUBLE(sc_arr, 0x140*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0x140*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (uint64 *)ALIGN_VEC_DBL(sc_arr); // Force vec_u64-alignment - ASSERT(HERE, 
((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; #else @@ -190,7 +190,7 @@ /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0x140; #ifdef USE_AVX512_I @@ -237,7 +237,7 @@ #ifdef MUL_LOHI64_SUBROUTINE #error MUL_LOHI64_SUBROUTINE defined! #endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq100_q32: p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq100_q32: p must be < 2^63!"); #ifdef USE_AVX512_I @@ -416,7 +416,7 @@ #endif // Init the modpow residue: - ASSERT(HERE, zshift < 48, "zshift out of expected range!"); + ASSERT(zshift < 48, "zshift out of expected range!"); dtmp = 1ull<= 0; j--) { #if FAC_DEBUG -if(dbg) printf("A: x = %20llu + 2^64* %20llu\n",x0.d0,x0.d1); +if(dbg) printf("A: x = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0,x0.d1); #endif #if THREE_OP128 /* Fused version of all 3 of the above function calls. Surprisingly, on Alpha this was significantly slower @@ -1206,8 +1206,8 @@ if(dbg) printf("A: x = %20llu + 2^64* %20llu\n",x0.d0,x0.d1); , x6, lo6, hi6 , x7, lo7, hi7); #if FAC_DEBUG -if(dbg) printf("B: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); -if(dbg) printf("B: h = %20llu + 2^64* %20llu\n",hi0.d0,hi0.d1); +if(dbg) printf("B: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); +if(dbg) printf("B: h = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",hi0.d0,hi0.d1); #endif /* For unknown reasons, the 8-operand version of MULL128 was slower than one-at-a-time. 
*/ @@ -1242,7 +1242,7 @@ if(dbg) printf("B: h = %20llu + 2^64* %20llu\n",hi0.d0,hi0.d1); MULL128(lo7, qinv7, lo7); */ #if FAC_DEBUG -if(dbg) printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); +if(dbg) printf("C: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); #endif MULH128_q8( @@ -1275,7 +1275,7 @@ if(dbg) printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); MULH128(lo7, q7, lo7); */ #if FAC_DEBUG -if(dbg) printf("D: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); +if(dbg) printf("D: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); #endif #endif /* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */ @@ -1288,7 +1288,7 @@ if(dbg) printf("D: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); if(CMPULT128(hi6, lo6)) { SUB128(q6, lo6, lo6); ADD128(lo6, hi6, x6); } else { SUB128(hi6, lo6, x6); } if(CMPULT128(hi7, lo7)) { SUB128(q7, lo7, lo7); ADD128(lo7, hi7, x7); } else { SUB128(hi7, lo7, x7); } #if FAC_DEBUG -if(dbg) printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1); +if(dbg) printf("j = %2d, Res = %20" PRIu64 " + 2^64* %20" PRIu64,j,x0.d0,x0.d1); #endif if(TEST_BIT128(pshift, j)) @@ -1303,7 +1303,7 @@ if(dbg) printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1); if(CMPUGT128(x6, qhalf6)){ ADD128(x6, x6, x6); SUB128(x6, q6, x6); }else{ ADD128(x6, x6, x6); } if(CMPUGT128(x7, qhalf7)){ ADD128(x7, x7, x7); SUB128(x7, q7, x7); }else{ ADD128(x7, x7, x7); } #if FAC_DEBUG -if(dbg) printf(" *2 = %20llu + 2^64* %20llu",x0.d0,x0.d1); +if(dbg) printf(" *2 = %20" PRIu64 " + 2^64* %20" PRIu64,x0.d0,x0.d1); #endif } #if FAC_DEBUG @@ -1341,7 +1341,7 @@ if(dbg) printf("\n"); SUB128(x7, q7, x7); #if FAC_DEBUG -if(dbg) printf("x0 = %20llu + 2^64* %20llu\n",x0.d0, x0.d1); +if(dbg) printf("x0 = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0, x0.d1); #endif /* Only do the full 128-bit (Xj== 1) check if the bottom 64 bits of Xj == 1: */ diff --git a/src/twopmodq128_96.c b/src/twopmodq128_96.c index e8eefe0c..88d77c49 100755 --- 
a/src/twopmodq128_96.c +++ b/src/twopmodq128_96.c @@ -58,7 +58,7 @@ uint64 twopmodq128_96(uint64 p, uint64 k) if(dbg)printf("twopmodq128_96:\n"); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q.d0 = p+p; q.d1 = 0; #ifdef MUL_LOHI64_SUBROUTINE MUL_LOHI64(q.d0, k,&q.d0,&q.d1); @@ -66,7 +66,7 @@ if(dbg)printf("twopmodq128_96:\n"); MUL_LOHI64(q.d0, k, q.d0, q.d1); #endif q.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ - ASSERT(HERE, (q.d1 >> 32) == 0, "(q.d1 >> 32) != 0"); + ASSERT((q.d1 >> 32) == 0, "(q.d1 >> 32) != 0"); if(first_entry || p != psave) { @@ -105,7 +105,7 @@ if(dbg)printf("twopmodq128_96:\n"); */ /* q must be odd for Montgomery-style modmul to work: */ #if FAC_DEBUG - ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq128_96 : (q.d0 & (uint64)1) == 1"); + ASSERT((q.d0 & (uint64)1) == 1, "twopmodq128_96 : (q.d0 & (uint64)1) == 1"); #endif /* Init qinv = q. We're really only interested in the bottom 2 bits of q. 
*/ qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2; qinv.d1 = (uint64)0; @@ -144,7 +144,7 @@ if(dbg)printf("twopmodq128_96:\n"); #endif #if FAC_DEBUG - ASSERT(HERE, qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128_96 : qinv.d1 == x.d1 && qinv.d0 == x.d0"); + ASSERT(qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq128_96 : qinv.d1 == x.d1 && qinv.d0 == x.d0"); #endif #if FAC_DEBUG @@ -182,7 +182,7 @@ if(dbg)printf("twopmodq128_96:\n"); if((pshift >> j) & (uint64)1) { #if FAC_DEBUG - ASSERT(HERE, CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)"); + ASSERT(CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)"); #endif ADD128(x,x,x); if(CMPULE128(q,x)) SUB128(x,q,x); @@ -223,7 +223,7 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_bu if((pshift >> j) & (uint64)1) { #if FAC_DEBUG - ASSERT(HERE, CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)"); + ASSERT(CMPULT128(x,q), "twopmodq128_96 : CMPULT128(x,q)"); #endif ADD128(x,x,x); /* Since we're using 128-bit arithmetic for the add, x+x cannot overflow. 
*/ if(CMPULE128(q,x)) SUB128(x,q,x); @@ -298,7 +298,7 @@ if(dbg)printf("twopmodq128_96_q4:\n"); if(dbg) printf("start_index = %u\n", (uint32)start_index); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p; q0.d1 = q1.d1 = q2.d1 = q3.d1 = 0; #ifdef MUL_LOHI64_SUBROUTINE @@ -312,10 +312,10 @@ if(dbg)printf("twopmodq128_96_q4:\n"); MUL_LOHI64(q2.d0, k2, q2.d0, q2.d1); MUL_LOHI64(q3.d0, k3, q3.d0, q3.d1); #endif - ASSERT(HERE, (q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0"); - ASSERT(HERE, (q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0"); - ASSERT(HERE, (q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0"); - ASSERT(HERE, (q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0"); + ASSERT((q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0"); + ASSERT((q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0"); + ASSERT((q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0"); + ASSERT((q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0"); q0.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ q1.d0 += 1; @@ -324,10 +324,10 @@ if(dbg)printf("twopmodq128_96_q4:\n"); /* q must be odd for Montgomery-style modmul to work: */ #if FAC_DEBUG - ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q0.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q1.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q1.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q2.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q2.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q3.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q3.d0 & (uint64)1) == 1"); + ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q0.d0 & (uint64)1) == 1"); + ASSERT((q1.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q1.d0 & (uint64)1) == 1"); + ASSERT((q2.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q2.d0 & (uint64)1) == 1"); + ASSERT((q3.d0 & (uint64)1) == 1, "twopmodq128_96_q4 : (q3.d0 & (uint64)1) == 1"); #endif qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2; qinv0.d1 = (uint64)0; qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2; qinv1.d1 = 
(uint64)0; @@ -420,10 +420,10 @@ if(dbg)printf("twopmodq128_96_q4:\n"); if((pshift >> j) & (uint64)1) { #if FAC_DEBUG - ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)"); - ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)"); - ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)"); - ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)"); + ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)"); + ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)"); + ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)"); + ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)"); #endif ADD128(x0, x0, x0); ADD128(x1, x1, x1); @@ -468,10 +468,10 @@ if(dbg)printf("twopmodq128_96_q4:\n"); /* For unknown reasons, the 8-operand version of MULL128 was slower than one-at-a-time. */ #if FAC_DEBUG - ASSERT(HERE, CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(SQR_LO)"); - ASSERT(HERE, hi.d1 == 0 , "twopmodq128_96_q4 : hi.d1 != 0"); + ASSERT(CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(SQR_LO)"); + ASSERT(hi.d1 == 0 , "twopmodq128_96_q4 : hi.d1 != 0"); hi64 = hi.d0; - ASSERT(HERE, hi64 == hi0 , "twopmodq128_96_q4 : CMPEQ128(SQR_HI)"); + ASSERT(hi64 == hi0 , "twopmodq128_96_q4 : CMPEQ128(SQR_HI)"); x=lo0;y=qinv0; MULL128(x,y,lo); #endif @@ -484,11 +484,11 @@ if(dbg)printf("twopmodq128_96_q4:\n"); /* Need to be careful about the order of the 2 inputs here, as MULH128x96 assumes the 2nd input is the one which is < 2^96: */ #if FAC_DEBUG - ASSERT(HERE, CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULL128)"); + ASSERT(CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULL128)"); x=lo0;y=q0; MULH128(x,y,lo); MULH128(y,x,hi); - ASSERT(HERE, CMPEQ128(lo, hi), "twopmodq128_96_q4 : MULH(X,Y) != MULH(Y,X)"); + ASSERT(CMPEQ128(lo, hi), "twopmodq128_96_q4 : MULH(X,Y) != MULH(Y,X)"); #endif MULH128x96_q4( lo0, q0, lo0 @@ -503,7 +503,7 @@ 
if(dbg)printf("twopmodq128_96_q4:\n"); , q3, lo3, lo3); #endif #if FAC_DEBUG - ASSERT(HERE, CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULH())"); + ASSERT(CMPEQ128(lo,lo0), "twopmodq128_96_q4 : CMPEQ128(MULH())"); /* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */ if(lo.d1 != 0 || hi64 < lo.d0) { @@ -523,7 +523,7 @@ if(dbg)printf("twopmodq128_96_q4:\n"); if(lo2.d1 != 0 || hi2 < lo2.d0){ SUB128(q2, lo2, x2); x2.d0 += hi2; x2.d1 += (x2.d0 < hi2); } else { x2.d0 = hi2 - lo2.d0; x2.d1 = (uint64)0; } if(lo3.d1 != 0 || hi3 < lo3.d0){ SUB128(q3, lo3, x3); x3.d0 += hi3; x3.d1 += (x3.d0 < hi3); } else { x3.d0 = hi3 - lo3.d0; x3.d1 = (uint64)0; } #if FAC_DEBUG - ASSERT(HERE, CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())"); + ASSERT(CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())"); #endif #if FAC_DEBUG @@ -537,10 +537,10 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_bu if(CMPULE128(q0,x)) SUB128(x,q0,x); #endif #if FAC_DEBUG - ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)"); - ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)"); - ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)"); - ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)"); + ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q4 : CMPULT128(x0, q0)"); + ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q4 : CMPULT128(x1, q1)"); + ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q4 : CMPULT128(x2, q2)"); + ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q4 : CMPULT128(x3, q3)"); #endif ADD128(x0, x0, x0); ADD128(x1, x1, x1); @@ -557,7 +557,7 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint128_base10_char(char_bu } #if FAC_DEBUG if(dbg)printf("\n"); - ASSERT(HERE, CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())"); + ASSERT(CMPEQ128( x, x0), "twopmodq128_96_q4 : CMPEQ128(MULH())"); #endif } @@ -638,7 +638,7 @@ if(dbg)printf("twopmodq128_96_q8:\n"); pshift = ~pshift; 
} - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p; q0.d1 = q1.d1 = q2.d1 = q3.d1 = q4.d1 = q5.d1 = q6.d1 = q7.d1 = 0; #ifdef MUL_LOHI64_SUBROUTINE @@ -669,28 +669,28 @@ if(dbg)printf("twopmodq128_96_q8:\n"); q5.d0 += 1; q6.d0 += 1; q7.d0 += 1; - ASSERT(HERE, (q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0"); - ASSERT(HERE, (q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0"); - ASSERT(HERE, (q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0"); - ASSERT(HERE, (q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0"); - ASSERT(HERE, (q4.d1 >> 32) == 0, "(q4.d1 >> 32) != 0"); - ASSERT(HERE, (q5.d1 >> 32) == 0, "(q5.d1 >> 32) != 0"); - ASSERT(HERE, (q6.d1 >> 32) == 0, "(q6.d1 >> 32) != 0"); - ASSERT(HERE, (q7.d1 >> 32) == 0, "(q7.d1 >> 32) != 0"); + ASSERT((q0.d1 >> 32) == 0, "(q0.d1 >> 32) != 0"); + ASSERT((q1.d1 >> 32) == 0, "(q1.d1 >> 32) != 0"); + ASSERT((q2.d1 >> 32) == 0, "(q2.d1 >> 32) != 0"); + ASSERT((q3.d1 >> 32) == 0, "(q3.d1 >> 32) != 0"); + ASSERT((q4.d1 >> 32) == 0, "(q4.d1 >> 32) != 0"); + ASSERT((q5.d1 >> 32) == 0, "(q5.d1 >> 32) != 0"); + ASSERT((q6.d1 >> 32) == 0, "(q6.d1 >> 32) != 0"); + ASSERT((q7.d1 >> 32) == 0, "(q7.d1 >> 32) != 0"); /* ! Find modular inverse (mod 2^128) of q in preparation for modular multiply. 
*/ /* q must be odd for Montgomery-style modmul to work: */ #if FAC_DEBUG - ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q0.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q1.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q1.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q2.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q2.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q3.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q3.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q4.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q4.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q5.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q5.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q6.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q6.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q7.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q7.d0 & (uint64)1) == 1"); + ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q0.d0 & (uint64)1) == 1"); + ASSERT((q1.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q1.d0 & (uint64)1) == 1"); + ASSERT((q2.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q2.d0 & (uint64)1) == 1"); + ASSERT((q3.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q3.d0 & (uint64)1) == 1"); + ASSERT((q4.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q4.d0 & (uint64)1) == 1"); + ASSERT((q5.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q5.d0 & (uint64)1) == 1"); + ASSERT((q6.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q6.d0 & (uint64)1) == 1"); + ASSERT((q7.d0 & (uint64)1) == 1, "twopmodq128_96_q8 : (q7.d0 & (uint64)1) == 1"); #endif qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2; qinv0.d1 = (uint64)0; qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2; qinv1.d1 = (uint64)0; @@ -801,14 +801,14 @@ if(dbg)printf("twopmodq128_96_q8:\n"); if((pshift >> j) & (uint64)1) { #if FAC_DEBUG - ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)"); - ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)"); - ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)"); - ASSERT(HERE, CMPULT128(x3, q3), 
"twopmodq128_96_q8 : CMPULT128(x3, q3)"); - ASSERT(HERE, CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)"); - ASSERT(HERE, CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)"); - ASSERT(HERE, CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)"); - ASSERT(HERE, CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)"); + ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)"); + ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)"); + ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)"); + ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)"); + ASSERT(CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)"); + ASSERT(CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)"); + ASSERT(CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)"); + ASSERT(CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)"); #endif ADD128(x0, x0, x0); ADD128(x1, x1, x1); @@ -843,7 +843,7 @@ if(dbg)printf("twopmodq128_96_q8:\n"); for(j = start_index-2; j >= 0; j--) { #if FAC_DEBUG -if(dbg)printf("A: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); +if(dbg)printf("A: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); #endif /* Haven't gotten IA64 version of this working properly yet: SQR_LOHI_INPLACE128_96_q8( @@ -866,7 +866,7 @@ if(dbg)printf("A: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); , x6, lo6, hi6 , x7, lo7, hi7); #if FAC_DEBUG -if(dbg)printf("B: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); +if(dbg)printf("B: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); #endif /* For unknown reasons, the 8-operand version of MULL128 was slower than one-at-a-time. 
*/ @@ -880,7 +880,7 @@ if(dbg)printf("B: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); , lo6, qinv6 , lo7, qinv7); #if FAC_DEBUG -if(dbg)printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); +if(dbg)printf("C: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); #endif #if(USE_128x96 > 0) /* Need to be careful about the order of the 2 inputs here, @@ -906,13 +906,13 @@ if(dbg)printf("C: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); , q7, lo7, lo7); #endif #if FAC_DEBUG -if(dbg)printf("D: l = %20llu + 2^64* %20llu\n",lo0.d0,lo0.d1); +if(dbg)printf("D: l = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",lo0.d0,lo0.d1); #endif /* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */ #if FAC_DEBUG -if(dbg)printf("On entry to (h> j) & (uint64)1) { #if FAC_DEBUG - ASSERT(HERE, CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)"); - ASSERT(HERE, CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)"); - ASSERT(HERE, CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)"); - ASSERT(HERE, CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)"); - ASSERT(HERE, CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)"); - ASSERT(HERE, CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)"); - ASSERT(HERE, CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)"); - ASSERT(HERE, CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)"); + ASSERT(CMPULT128(x0, q0), "twopmodq128_96_q8 : CMPULT128(x0, q0)"); + ASSERT(CMPULT128(x1, q1), "twopmodq128_96_q8 : CMPULT128(x1, q1)"); + ASSERT(CMPULT128(x2, q2), "twopmodq128_96_q8 : CMPULT128(x2, q2)"); + ASSERT(CMPULT128(x3, q3), "twopmodq128_96_q8 : CMPULT128(x3, q3)"); + ASSERT(CMPULT128(x4, q4), "twopmodq128_96_q8 : CMPULT128(x4, q4)"); + ASSERT(CMPULT128(x5, q5), "twopmodq128_96_q8 : CMPULT128(x5, q5)"); + ASSERT(CMPULT128(x6, q6), "twopmodq128_96_q8 : CMPULT128(x6, q6)"); + ASSERT(CMPULT128(x7, q7), "twopmodq128_96_q8 : CMPULT128(x7, q7)"); #endif ADD128(x0, x0, x0); ADD128(x1, x1, x1); @@ -956,7 
+956,7 @@ if(dbg)printf("j = %2d, Res = %20llu + 2^64* %20llu",j,x0.d0,x0.d1); if(CMPULE128(q6, x6)) SUB128(x6, q6, x6); if(CMPULE128(q7, x7)) SUB128(x7, q7, x7); #if FAC_DEBUG -if(dbg)printf(" *2 = %20llu + 2^64* %20llu",x0.d0,x0.d1); +if(dbg)printf(" *2 = %20" PRIu64 " + 2^64* %20" PRIu64,x0.d0,x0.d1); #endif } #if FAC_DEBUG @@ -993,7 +993,7 @@ if(dbg)printf("\n"); SUB128(x7, q7, x7); #if FAC_DEBUG -if(dbg)printf("x0 = %20llu + 2^64* %20llu\n",x0.d0, x0.d1); +if(dbg)printf("x0 = %20" PRIu64 " + 2^64* %20" PRIu64 "\n",x0.d0, x0.d1); #endif /* Only do the full 128-bit (Xj== 1) check if the bottom 64 bits of Xj == 1: */ diff --git a/src/twopmodq160.c b/src/twopmodq160.c index 9965c0f6..72cc519e 100755 --- a/src/twopmodq160.c +++ b/src/twopmodq160.c @@ -64,7 +64,7 @@ uint64 twopmodq160(uint64 *p_in, uint64 k) #if FAC_DEBUG if(dbg)printf("twopmodq160:\n"); #endif - ASSERT(HERE, (p.d2 == 0) && (p.d1 >> 63) == 0, "p must be < 2^127!"); + ASSERT((p.d2 == 0) && (p.d1 >> 63) == 0, "p must be < 2^127!"); ADD128(p,p, q); q.d2 = mi64_mul_scalar((uint64 *)&q, k, (uint64 *)&q, 2); q.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ @@ -94,7 +94,7 @@ if(dbg)printf("twopmodq160:\n"); { j = leadz64(pshift.d2); /* Remember, pshift is stored in a 192-bit... */ #if FAC_DEBUG - ASSERT(HERE, j >= 32,"twopmodq160: j >= 32"); + ASSERT(j >= 32,"twopmodq160: j >= 32"); #endif /* Extract leftmost 8 bits of pshift (if > 159, use the leftmost 7) and subtract from 160: */ lead8 = (((pshift.d2<<j) + (pshift.d1>>(64-j))) >> 56); /* lead8 in [128, 255] */ @@ -149,7 +149,7 @@ if(dbg)printf("twopmodq160:\n"); */ /* q must be odd for Montgomery-style modmul to work: */ #if FAC_DEBUG - ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq160 : (q.d0 & (uint64)1) == 1"); + ASSERT((q.d0 & (uint64)1) == 1, "twopmodq160 : (q.d0 & (uint64)1) == 1"); #endif /* Init qinv = q. We're really only interested in the bottom 2 bits of q. 
*/ @@ -172,14 +172,14 @@ if(dbg)printf("twopmodq160:\n"); MULL160(q, qinv, x); #if FAC_DEBUG MULL192(q, qinv, y); y.d2 &= 0x00000000ffffffff; - ASSERT(HERE, CMPEQ192(x, y), "twopmodq160: CMPEQ192(x, y)"); + ASSERT(CMPEQ192(x, y), "twopmodq160: CMPEQ192(x, y)"); SUB160 (TWO160, y, y); MULL192(y, qinv, y); y.d2 &= 0x00000000ffffffff; #endif SUB160 (TWO160, x, x); MULL160(qinv, x, qinv); #if FAC_DEBUG - ASSERT(HERE, CMPEQ192(qinv, y), "twopmodq160: CMPEQ192(qinv, y)"); + ASSERT(CMPEQ192(qinv, y), "twopmodq160: CMPEQ192(qinv, y)"); #endif MULL160(q, qinv, x); @@ -211,7 +211,7 @@ printf(""); #endif MULH160(q,lo,lo); #if FAC_DEBUG - ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: CMPEQ192(lo,y)"); + ASSERT(CMPEQ192(lo,y), "twopmodq160: CMPEQ192(lo,y)"); if(dbg) printf("q*lo/2^160 = %s\n", &char_buf[convert_uint192_base10_char(char_buf, lo)]); #endif @@ -224,7 +224,7 @@ printf(""); if(TEST_BIT160(pshift, j)) { #if FAC_DEBUG - ASSERT(HERE, CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)"); + ASSERT(CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT160(x, qhalf)){ ADD160(x, x, x); SUB160(x, q, x); }else{ ADD160(x, x, x); } @@ -248,20 +248,20 @@ if(dbg) printf("2x= %s\n", &char_buf[convert_uint192_base10_char(char_buf, x)]); SQR_LOHI192(x, y, z); LSHIFT_FAST192(z,32,z); z.d0 += (y.d2 >> 32); /* x^2/2^160 */ y.d2 &= 0x00000000ffffffff; /* x^2%2^160 */ - ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: SQR_LOHI160: CMPEQ192(lo,y)"); - ASSERT(HERE, CMPEQ192(hi,z), "twopmodq160: SQR_LOHI160: CMPEQ192(hi,z)"); + ASSERT(CMPEQ192(lo,y), "twopmodq160: SQR_LOHI160: CMPEQ192(lo,y)"); + ASSERT(CMPEQ192(hi,z), "twopmodq160: SQR_LOHI160: CMPEQ192(hi,z)"); y = lo; MULL192(y, qinv, y); y.d2 &= 0x00000000ffffffff; #endif MULL160(lo,qinv,lo); #if FAC_DEBUG - ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: MULL160: CMPEQ192(lo,y)"); + ASSERT(CMPEQ192(lo,y), "twopmodq160: MULL160: CMPEQ192(lo,y)"); LSHIFT_FAST192(lo,32,y); /* y 
= lo*2^32 */ MULH192(q,y,y); #endif MULH160(q,lo,lo); #if FAC_DEBUG - ASSERT(HERE, CMPEQ192(lo,y), "twopmodq160: MULH160: CMPEQ192(lo,y)"); + ASSERT(CMPEQ192(lo,y), "twopmodq160: MULH160: CMPEQ192(lo,y)"); #endif /* If h < l, then calculate q-l+h < q; otherwise calculate h-l. */ @@ -281,7 +281,7 @@ if(dbg) printf("2x= %s\n", &char_buf[convert_uint192_base10_char(char_buf, x)]); if(TEST_BIT160(pshift, j)) { #if FAC_DEBUG - ASSERT(HERE, CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)"); + ASSERT(CMPULT160(x,q), "twopmodq160 : CMPULT160(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT160(x, qhalf)){ ADD160(x, x, x); SUB160(x, q, x); }else{ ADD160(x, x, x); } diff --git a/src/twopmodq192.c b/src/twopmodq192.c index c6f7c3db..6da8910f 100755 --- a/src/twopmodq192.c +++ b/src/twopmodq192.c @@ -106,7 +106,7 @@ uint192 twopmmodq192(uint192 p, uint192 q) } #endif // Find inverse (mod 2^192) of q; q must be odd for Montgomery-style modmul to work: - ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmmodq192 : q must be odd for Montgomery-style modmul to work"); + ASSERT((q.d0 & (uint64)1) == 1, "twopmmodq192 : q must be odd for Montgomery-style modmul to work"); /* Init qinv = q. We're really only interested in the bottom 2 bits of q. 
*/ qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2; qinv.d1 = qinv.d2 = 0ull; /* Compute qinv = q^-1 (mod R = 2^192) via Newton iteration qinv = qinv*(2 - q*qinv), starting with @@ -222,7 +222,7 @@ uint192 twopmodq192(uint192 p, uint192 q) mi64_div((uint64*)&qhalf,(uint64*)&p, 3,3, (uint64*)&x, (uint64*)&lo); // x contains k; lo = (q-1)/2 % p // dbg = (x.d0 == 488) && (x.d1 == 0 && x.d2 == 0); if(dbg) { - ASSERT(HERE, mi64_iszero((uint64*)&lo, 3), "k must divide (q-1)/2!"); + ASSERT(mi64_iszero((uint64*)&lo, 3), "k must divide (q-1)/2!"); printf("twopmodq192:\n"); } #endif @@ -309,7 +309,7 @@ uint192 twopmodq192(uint192 p, uint192 q) */ /* q must be odd for Montgomery-style modmul to work: */ #if FAC_DEBUG - ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq192 : q must be odd for Montgomery-style modmul to work!"); + ASSERT((q.d0 & (uint64)1) == 1, "twopmodq192 : q must be odd for Montgomery-style modmul to work!"); #endif /* Init qinv = q. We're really only interested in the bottom 2 bits of q. 
*/ qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2; qinv.d2 = qinv.d1 = (uint64)0; @@ -392,7 +392,7 @@ q*qinv*lo = |000 (192-x bits) 000||-------------------------------------- q*qinv if(TEST_BIT192(pshift, j)) { #if FAC_DEBUG - ASSERT(HERE, CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)"); + ASSERT(CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT192(x, qhalf)){ ADD192(x, x, x); SUB192(x, q, x); }else{ ADD192(x, x, x); } @@ -432,7 +432,7 @@ q*qinv*lo = |000 (192-x bits) 000||-------------------------------------- q*qinv if(TEST_BIT192(pshift, j)) { #if FAC_DEBUG - ASSERT(HERE, CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)"); + ASSERT(CMPULT192(x,q), "twopmodq192 : CMPULT192(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT192(x, qhalf)){ ADD192(x, x, x); SUB192(x, q, x); }else{ ADD192(x, x, x); } @@ -481,10 +481,10 @@ uint64 twopmodq192_q4(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3) // Use x0 as tmp to hold 2*p: ADD192(p,p, x0); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!"); q0.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ q1.d0 += 1; q2.d0 += 1; @@ -647,7 +647,7 @@ uint64 twopmodq192_q4(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3) 
if(TEST_BIT192(pshift, j)) { #if FAC_DEBUG - ASSERT(HERE, CMPULT192(x0,q0), "twopmodq192_q4: CMPULT192(x,q)"); + ASSERT(CMPULT192(x0,q0), "twopmodq192_q4: CMPULT192(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT192(x0, qhalf0)){ ADD192(x0, x0, x0); SUB192(x0, q0, x0); }else{ ADD192(x0, x0, x0); } @@ -708,7 +708,7 @@ uint64 twopmodq192_q4(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3) if(TEST_BIT192(pshift, j)) { #if FAC_DEBUG - ASSERT(HERE, CMPULT192(x0,q0), "twopmodq192_q4 : CMPULT192(x,q)"); + ASSERT(CMPULT192(x0,q0), "twopmodq192_q4 : CMPULT192(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT192(x0, qhalf0)){ ADD192(x0, x0, x0); SUB192(x0, q0, x0); }else{ ADD192(x0, x0, x0); } @@ -774,7 +774,7 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr __vout.d2 = __lo + __cy;\ __cw = __hi + (__vout.d2 < __lo); /* carryout into cw */\ __lo = __vout.d2; /* bw0 = z[len-1]; */\ -/*if(__k==900) {printf("Macro: bw0 = %20llu, cw = %20llu, z` = %s\n", __lo,__cw,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\ +/*if(__k==900) {printf("Macro: bw0 = %20" PRIu64 ", cw = %20" PRIu64 ", z` = %s\n", __lo,__cw,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\ /* 2. 
compute low n words of z = z' + y via vector-vector add, any carryout of that gets added to a 2nd copy of cw, cz: */\ /* mi64_add(y,z,z, len): // z = z' + y */\ __vout.d0 = __vin.d0 + __vout.d0;\ @@ -789,9 +789,9 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr __cy = (__hi < __vin.d2);\ __vout.d2 += __hi;\ __cy += (__vout.d2 < __hi);\ -/*if(__k==900) {printf("Macro: __vout.d2 [out] = %20llu\n", __vout.d2);}*/\ +/*if(__k==900) {printf("Macro: __vout.d2 [out] = %20" PRIu64 "\n", __vout.d2);}*/\ __cz = __cw + __cy; /* cz = cw + mi64_add(y,z,z, len); // z = z' + y */\ -/*if(__k==900) {printf("Macro: cz = %20llu, z = %s\n", __cz,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\ +/*if(__k==900) {printf("Macro: cz = %20" PRIu64 ", z = %s\n", __cz,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\ \ /* 3. compute low n words of z >> (b-p), then separately shift in cz from the left, via (2^b*cz) >> (b-p) = (cz << p). */\ /* bw1 = mi64_shrl(z,z,nshift,len); // low n words of z >> (b-p); high 64 bits of off-shifted portion saved in bw1 */\ @@ -818,7 +818,7 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr /* Most-significant element gets zeros shifted in from the left: */\ __vout.d2 >>= __rembits;\ }\ -/*if(__k==900) {printf("Macro: bw1 = %20llu, z>> = %s\n", __hi,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\ +/*if(__k==900) {printf("Macro: bw1 = %20" PRIu64 ", z>> = %s\n", __hi,&char_buf[convert_uint192_base10_char(char_buf,__vout)]);}*/\ \ /* Check for borrow-on-subtract of to-be-off-shifted sections: */\ __bw = (__lo > __hi);\ @@ -869,7 +869,7 @@ mi64_mul_vector_hi_half for moduli q = 2.k.M(p) + 1, where M(p) is a Mersenne pr __cy = __vout.d2 - __bw;\ __bw = (__cy > __vout.d2);\ __vout.d2 = __cy;\ - ASSERT(HERE, !__bw, "bw != 0");\ + ASSERT(!__bw, "bw != 0");\ }\ } @@ -900,10 +900,10 @@ uint64 twopmodq192_q4_qmmp(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 
p.d0 = p_in[0]; p.d1 = p_in[1]; p.d2 = p_in[2]; // Use x0 as tmp to hold 2*p: ADD192(p,p, x0); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!"); q0.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ q1.d0 += 1; @@ -923,7 +923,7 @@ uint64 twopmodq192_q4_qmmp(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 // Check that it's really a double-Mersenne: Adding one, right-shift by mmpsave = #bits give 1: mi64_add_scalar(p_in, 1ull, (uint64*)&x, 3); mi64_shrl((uint64*)&x, (uint64*)&x, mmpsave, 3,3); - --x.d0; ASSERT(HERE, mi64_iszero((uint64*)&x, 3), "MMp check failed!"); + --x.d0; ASSERT(mi64_iszero((uint64*)&x, 3), "MMp check failed!"); x.d0 = 192; x.d1 = x.d2 = 0; ADD192(p, x, pshift); if(pshift.d2) @@ -1082,10 +1082,10 @@ if(dbg) { #endif #if 1 - MULH192_QMMP(x0,mmpsave,k0,lo0,3); //MULH192(x0,q0,x); ASSERT(HERE, CMPEQ192(lo0, x), "MULH192_QMMP fail!"); - MULH192_QMMP(x1,mmpsave,k1,lo1,3); //MULH192(x1,q1,x); ASSERT(HERE, CMPEQ192(lo1, x), "MULH192_QMMP fail!"); - MULH192_QMMP(x2,mmpsave,k2,lo2,3); //MULH192(x2,q2,x); ASSERT(HERE, CMPEQ192(lo2, x), "MULH192_QMMP fail!"); - MULH192_QMMP(x3,mmpsave,k3,lo3,3); //MULH192(x3,q3,x); ASSERT(HERE, CMPEQ192(lo3, x), "MULH192_QMMP fail!"); + MULH192_QMMP(x0,mmpsave,k0,lo0,3); //MULH192(x0,q0,x); ASSERT(CMPEQ192(lo0, x), "MULH192_QMMP fail!"); + 
MULH192_QMMP(x1,mmpsave,k1,lo1,3); //MULH192(x1,q1,x); ASSERT(CMPEQ192(lo1, x), "MULH192_QMMP fail!"); + MULH192_QMMP(x2,mmpsave,k2,lo2,3); //MULH192(x2,q2,x); ASSERT(CMPEQ192(lo2, x), "MULH192_QMMP fail!"); + MULH192_QMMP(x3,mmpsave,k3,lo3,3); //MULH192(x3,q3,x); ASSERT(CMPEQ192(lo3, x), "MULH192_QMMP fail!"); #else MULH192(x0,q0,lo0); MULH192(x1,q1,lo1); @@ -1159,14 +1159,14 @@ uint64 twopmodq192_q8(uint64 *p_in, uint64 k0, uint64 k1, uint64 k2, uint64 k3, // Use x0 as tmp to hold 2*p: ADD192(p,p, x0); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k4, (uint64 *)&q4, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k5, (uint64 *)&q5, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k6, (uint64 *)&q6, 3), "q must be < 2^192!"); - ASSERT(HERE, !mi64_mul_scalar((uint64 *)&x0, k7, (uint64 *)&q7, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k0, (uint64 *)&q0, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k1, (uint64 *)&q1, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k2, (uint64 *)&q2, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k3, (uint64 *)&q3, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k4, (uint64 *)&q4, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k5, (uint64 *)&q5, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k6, (uint64 *)&q6, 3), "q must be < 2^192!"); + ASSERT(!mi64_mul_scalar((uint64 *)&x0, k7, (uint64 *)&q7, 3), "q must be < 2^192!"); q0.d0 += 1; /* Since 2*p*k even, no need to check 
for overflow here */ q1.d0 += 1; diff --git a/src/twopmodq256.c b/src/twopmodq256.c index cc7a79d6..b42d349e 100755 --- a/src/twopmodq256.c +++ b/src/twopmodq256.c @@ -136,7 +136,7 @@ uint256 twopmmodq256(uint256 p, uint256 q) } #endif // Find inverse (mod 2^256) of q; q must be odd for Montgomery-style modmul to work: - ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmmodq256 : q must be odd for Montgomery-style modmul to work"); + ASSERT((q.d0 & (uint64)1) == 1, "twopmmodq256 : q must be odd for Montgomery-style modmul to work"); /* Init qinv = q. We're really only interested in the bottom 2 bits of q. */ qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2; qinv.d1 = qinv.d2 = qinv.d3 = 0ull; /* Compute qinv = q^-1 (mod R = 2^256) via Newton iteration qinv = qinv*(2 - q*qinv), starting with @@ -342,7 +342,7 @@ uint256 twopmodq256(uint256 p, uint256 q) #endif /* Since zstart is a power of two < 2^256, use a streamlined code sequence for the first iteration: */ - ASSERT(HERE, start_index>=2, "twopmodq256 : start_index < 2!"); + ASSERT(start_index>=2, "twopmodq256 : start_index < 2!"); j = start_index-1; /* MULL256(zstart,qinv,lo) simply amounts to a left-shift of the bits of qinv: */ diff --git a/src/twopmodq64_test.c b/src/twopmodq64_test.c index dc4003e6..00461d36 100755 --- a/src/twopmodq64_test.c +++ b/src/twopmodq64_test.c @@ -113,7 +113,7 @@ x = x+x-q+FERMAT; // In the case of interest, x = (q+1)/2 < 2^63, so x + x cannot overflow. 
return (x==1); #else // ifndef __CUDA_ARCH__ - ASSERT(HERE, 0, "Device code being called in host mode!"); + ASSERT(0, "Device code being called in host mode!"); return 0; #endif } @@ -246,7 +246,7 @@ r += (x3+x3-q3+FERMAT == 1); return r; #else // ifndef __CUDA_ARCH__ - ASSERT(HERE, 0, "Device code being called in host mode!"); + ASSERT(0, "Device code being called in host mode!"); return 0; #endif } @@ -327,7 +327,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) { //int dbg = ( (p == (1ull<<32)) && ( (k0 == 2958ull) || (k1 == 2958ull) || (k2 == 2958ull) || (k3 == 2958ull) ) ); -//if(dbg) printf("Hit! k0-3 = %llu, %llu, %llu, %llu\n",k0, k1, k2, k3); +//if(dbg) printf("Hit! k0-3 = %" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "\n",k0, k1, k2, k3); int32 j; uint64 r = (p<<1), q0 = 1+r*k0, q1 = 1+r*k1, q2 = 1+r*k2, q3 = 1+r*k3 , qinv0, qinv1, qinv2, qinv3 @@ -348,7 +348,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 , "even modulus!"); // Compute 64-bit mod-inverses starting with 8-bits-good initializers: uint32 q32_0,q32_1,q32_2,q32_3, qi32_0,qi32_1,qi32_2,qi32_3; @@ -407,7 +407,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) x2 = y2; x3 = y3; } -//if(dbg) printf("x0 = %llu\n",x0); +//if(dbg) printf("x0 = %" PRIu64 "\n",x0); //printf("twopmodq64_q4 : x1 = %s\n", &str0[convert_uint64_base10_char(str0, x1)] ); //for(j = start_index-2; j >= 0; j--) { @@ -553,7 +553,7 @@ uint64 twopmodq64_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) //printf("twopmodq64_q4 : x1 = %s\n", &str0[convert_uint64_base10_char(str0, x1+x1-q1)] ); //exit(0); /*...Double and return. These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. 
*/ -//if(dbg) printf("xout = %llu\n",x0+x0-q0+FERMAT); +//if(dbg) printf("xout = %" PRIu64 "\n",x0+x0-q0+FERMAT); r = 0; if(x0+x0-q0+FERMAT == 1) r += 1; if(x1+x1-q1+FERMAT == 1) r += 2; @@ -586,7 +586,7 @@ uint64 twopmodq64_q8(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3, uint6 pshift = ~pshift; /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); + ASSERT(q0 & 1 && q1 & 1 && q2 & 1 && q3 & 1 && q4 & 1 && q5 & 1 && q6 & 1 && q7 & 1 , "even modulus!"); // Compute 64-bit mod-inverses starting with 8-bits-good initializers: uint32 q32_0,q32_1,q32_2,q32_3,q32_4,q32_5,q32_6,q32_7, qi32_0,qi32_1,qi32_2,qi32_3,qi32_4,qi32_5,qi32_6,qi32_7; diff --git a/src/twopmodq80.c b/src/twopmodq80.c index 5e334ee4..8fcac633 100755 --- a/src/twopmodq80.c +++ b/src/twopmodq80.c @@ -94,11 +94,11 @@ The key 3-operation sequence here is as follows: int fidx; #if FAC_DEBUG if(dbg) { - printf("twopmodq78_3WORD_DOUBLE with p = %u, k = %llu, tid = %u\n",p,k,i); + printf("twopmodq78_3WORD_DOUBLE with p = %u, k = %" PRIu64 ", tid = %u\n",p,k,i); } #endif /* -if(k == 7946076362870052)printf("In twopmodq78_3WORD_DOUBLE with i = %u, p = %u, k = %llu\n",i,p,k); +if(k == 7946076362870052)printf("In twopmodq78_3WORD_DOUBLE with i = %u, p = %u, k = %" PRIu64 "\n",i,p,k); */ q.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE @@ -138,7 +138,7 @@ if(k == 7946076362870052)printf("In twopmodq78_3WORD_DOUBLE with i = %u, p = %u, MULH64(q.d0, qinv.d0, hi64); qinv.d1 = -qinv.d0*(q.d1*qinv.d0 + hi64); qinv.d1 &= 0x0000000000003fff; /* Only want the lower 14 bits here */ -// if(i == 0)printf("In twopmodq78_gpu with p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,k,zshift,start_index); +// if(i == 0)printf("In twopmodq78_gpu with p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,k,zshift,start_index); /* Convert qinv to floating form: */ 
CVT_UINT78_3WORD_DOUBLE(qinv, fqinv0,fqinv1,fqinv2); #if FAC_DEBUG @@ -284,12 +284,12 @@ z0 = 6272576; z12 = 898312175313603; z=z0+a*z12 <*** z0 is +1 too large *** SUB96(x,q,x); #if FAC_DEBUG if(dbg) { - printf("k = %llu: X_out = %u*2^64 + %llu\n", x.d1,x.d0); + printf("k = %" PRIu64 ": X_out = %u*2^64 + %" PRIu64 "\n", x.d1,x.d0); } #endif return (CMPEQ96(x, ONE96)); #else // ifndef __CUDA_ARCH__ - ASSERT(HERE, 0, "Device code being called in host mode!"); + ASSERT(0, "Device code being called in host mode!"); return 0; #endif } @@ -629,7 +629,7 @@ z0 = 6272576; z12 = 898312175313603; z=z0+a*z12 <*** z0 is +1 too large *** r += tmp3 << 3; return r; #else // ifndef __CUDA_ARCH__ - ASSERT(HERE, 0, "Device code being called in host mode!"); + ASSERT(0, "Device code being called in host mode!"); return 0; #endif } @@ -757,10 +757,10 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k) #if FAC_DEBUG if(dbg) { - printf("%s with p = %llu, k = %llu\n",func,p,k); + printf("%s with p = %" PRIu64 ", k = %" PRIu64 "\n",func,p,k); } #endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!"); q.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -769,7 +769,7 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k) MUL_LOHI64(q.d0, k, q.d0, q.d1); #endif q.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ - ASSERT(HERE, (q.d1 >> 14) == 0, "twopmodq78 : (q.d1 >> 14) != 0"); + ASSERT((q.d1 >> 14) == 0, "twopmodq78 : (q.d1 >> 14) != 0"); /* Convert q to floating form: */ CVT_UINT78_3WORD_DOUBLE(q, fq0,fq1,fq2); @@ -812,7 +812,7 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k) pshift = ~pshift; #if FAC_DEBUG - if(dbg) printf("pshift = 0x%llX\n",pshift); + if(dbg) printf("pshift = %#" PRIX64 "\n",pshift); #endif } @@ -864,7 +864,7 @@ uint64 
twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k) #endif qinv.d1 &= 0x0000000000003fff; /* Only want the lower 14 bits here */ -// printf("twopmodq78_3WORD_DOUBLE with p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", (uint32)p,(uint32)pshift,k,zshift,start_index); +// printf("twopmodq78_3WORD_DOUBLE with p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", (uint32)p,(uint32)pshift,k,zshift,start_index); /* Convert qinv to floating form: */ /* cvt_uint78_3word_double(qinv, &fqinv0,&fqinv1,&fqinv2); */ @@ -957,8 +957,8 @@ uint64 twopmodq78_3WORD_DOUBLE(uint64 p, uint64 k) CVT78_3WORD_DOUBLE_UINT96(fx0,fx1,fx2, x); if(~pshift != p+78) { - ASSERT(HERE, ~pshift > (p+78), "twopmodq80 : Only support pshift >= true value!"); - ASSERT(HERE,(~pshift - (p+78)) < 32, "twopmodq80 : Only support pshift-diff < 32!"); + ASSERT(~pshift > (p+78), "twopmodq80 : Only support pshift >= true value!"); + ASSERT((~pshift - (p+78)) < 32, "twopmodq80 : Only support pshift-diff < 32!"); qmul = fx0 + fx1*TWO26FLOAT; qmul += fx2*TWO26FLOAT*TWO26FLOAT; // Extra power of 2 is because in this flow we do not do the final 2*x-q step in the 'else' below: @@ -988,7 +988,7 @@ if(~pshift != p+78) { SUB96(x,lo,x); #if FAC_DEBUG if(dbg) { - printf("X_out[A] = %u*2^64 + %llu\n", x.d1,x.d0); + printf("X_out[A] = %u*2^64 + %" PRIu64 "\n", x.d1,x.d0); } #endif } else { @@ -997,7 +997,7 @@ if(~pshift != p+78) { SUB96(x,q,x); #if FAC_DEBUG if(dbg) { - printf("X_out[B] = %u*2^64 + %llu\n", x.d1,x.d0); + printf("X_out[B] = %u*2^64 + %" PRIu64 "\n", x.d1,x.d0); } #endif } @@ -1010,25 +1010,25 @@ if(~pshift != p+78) { #ifdef USE_IMCI512 uint64 twopmodq78_3WORD_DOUBLE_q2(uint64 p, uint64 k0, uint64 k1, int init_sse2, int thr_id) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q2 cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q2 cannot be called for k1om builds!"); return 0; } uint64 twopmodq78_3WORD_DOUBLE_q4(uint64 p, uint64 k0, uint64 k1, uint64 k2, 
uint64 k3, int init_sse2, int thr_id) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q4 cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q4 cannot be called for k1om builds!"); return 0; } uint64 twopmodq78_3WORD_DOUBLE_q4_REF(uint64 p, uint64 k0, uint64 k1, uint64 k2, uint64 k3) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q4_REF cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q4_REF cannot be called for k1om builds!"); return 0; } uint64 twopmodq78_3WORD_DOUBLE_q8(uint64 p, uint64 k[], int init_sse2, int thr_id) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q8 cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q8 cannot be called for k1om builds!"); return 0; } uint64 twopmodq78_3WORD_DOUBLE_q16(uint64 p, uint64 k[], int init_sse2, int thr_id) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q16 cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q16 cannot be called for k1om builds!"); return 0; } uint64 twopmodq78_3WORD_DOUBLE_q32(uint64 p, uint64 k[], int init_sse2, int thr_id) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q32 cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q32 cannot be called for k1om builds!"); return 0; } uint64 twopmodq78_3WORD_DOUBLE_q64(uint64 p, uint64 k[], int init_sse2, int thr_id) { - ASSERT(HERE,0,"twopmodq78_3WORD_DOUBLE_q64 cannot be called for k1om builds!"); return 0; + ASSERT(0,"twopmodq78_3WORD_DOUBLE_q64 cannot be called for k1om builds!"); return 0; } #else @@ -1118,7 +1118,7 @@ if(~pshift != p+78) { double gq0,gq1,gq2, gqinv0,gqinv1,gqinv2, gx0,gx1,gx2, glo0,glo1,glo2, ghi0,ghi1,ghi2; // Note: In ||-init mode, *value* of init_sse2 to store #threads-to-init-for: if(init_sse2) { - ASSERT(HERE, init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!"); + ASSERT(init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!"); return 0; // In non-SIMD 
mode, ||-init call is a no-op } @@ -1126,7 +1126,7 @@ if(~pshift != p+78) { #if FAC_DEBUG if(dbg) { - printf("%s with p = %llu, k0 = %llu, k1 = %llu\n",func,p,k0,k1); + printf("%s with p = %" PRIu64 ", k0 = %" PRIu64 ", k1 = %" PRIu64 "\n",func,p,k0,k1); } #endif @@ -1162,7 +1162,7 @@ if(~pshift != p+78) { pshift = ~pshift; #if FAC_DEBUG - if(dbg) printf("pshift = 0x%llX\n",pshift); + if(dbg) printf("pshift = %#" PRIX64 "\n",pshift); #endif } @@ -1183,18 +1183,18 @@ if(~pshift != p+78) { #endif fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. 
unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; } // Alloc the local-memory block: - sc_arr = ALLOC_DOUBLE(sc_arr, 0x2c*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0x2c*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr); // Force vec_dbl-alignment - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; two13i = sc_ptr + 0x18; @@ -1252,7 +1252,7 @@ if(~pshift != p+78) { /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0x2c; /* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 2 to span an SSE register: */ fq0 = sc_ptr + 0x00; gq0 = sc_ptr + 0x01; @@ -1271,13 +1271,13 @@ if(~pshift != p+78) { two26i = sc_ptr + 0x1c; sse2_rnd=sc_ptr + 0x1e; half = sc_ptr + 0x20; - // printf("Thr %d ONE96_PTR address = %llX; data.d0,d1 = %llu,%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1); - tmp = (vec_dbl*)sse2_rnd; ASSERT(HERE,(tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!"); + // printf("Thr %d ONE96_PTR address = %" PRIX64 "; data.d0,d1 = %" PRIu64 ",%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1); + tmp = (vec_dbl*)sse2_rnd; ASSERT((tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!"); #endif #endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q2 : p must be < 2^63!"); q0.d0 = q1.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 
expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -1289,8 +1289,8 @@ if(~pshift != p+78) { #endif q0.d0 += 1; /* Since 2*p*k even, no need to check for overflow here */ q1.d0 += 1; - ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q2 : (q0.d1 >> 14) != 0"); - ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q2 : (q1.d1 >> 14) != 0"); + ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q2 : (q0.d1 >> 14) != 0"); + ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q2 : (q1.d1 >> 14) != 0"); /* Convert q to floating form: */ #ifdef USE_SSE2 @@ -1592,7 +1592,7 @@ if(~pshift != p+78) { #elif OS_BITS == 32 #error 32-bit OSes no longer supported for SIMD builds! - ASSERT(HERE, (uint32)(~pshift) == 0, "p+78 must be 32-bit here for 32-bit ASM support!"); + ASSERT((uint32)(~pshift) == 0, "p+78 must be 32-bit here for 32-bit ASM support!"); #else // The 64-bit version of the macro is timing-suboptimal because I used it as a testbed: // This 2-TF-input/4-xxm-register version serves as the basis for an 8-input version @@ -1976,7 +1976,7 @@ if(~pshift != p+78) { #endif int fidx,gidx,hidx,iidx; if(init_sse2) { - ASSERT(HERE, init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!"); + ASSERT(init_sse2 <= 1, "Multithreading currently only supported for SIMD builds!"); return 0; // In non-SIMD mode, ||-init call is a no-op } @@ -2021,18 +2021,18 @@ if(~pshift != p+78) { #endif fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= 
NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; } // Alloc the local-memory block: - sc_arr = ALLOC_DOUBLE(sc_arr, 0x50*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0x50*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr); // Force vec_dbl-alignment - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; two13i = sc_ptr + 0x40; @@ -2095,7 +2095,7 @@ if(~pshift != p+78) { /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0x50; /* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 2 to span an SSE register: */ fq0 = sc_ptr + 0x00; gq0 = sc_ptr + 0x01; hq0 = sc_ptr + 0x02; iq0 = sc_ptr + 0x03; @@ -2119,12 +2119,12 @@ if(~pshift != p+78) { two26i = sc_ptr + 0x44; sse2_rnd=sc_ptr + 0x46; half = sc_ptr + 0x48; - tmp = (vec_dbl*)sse2_rnd; ASSERT(HERE,(tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!"); + tmp = (vec_dbl*)sse2_rnd; ASSERT((tmp->d0 == crnd) && (tmp->d1 == crnd), "Bad data at sse2_rnd address!"); #endif #endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit 
builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -2142,10 +2142,10 @@ if(~pshift != p+78) { q1.d0 += 1; q2.d0 += 1; q3.d0 += 1; - ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0"); - ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0"); - ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0"); - ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0"); + ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0"); + ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0"); + ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0"); + ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0"); q32_0 = (uint32)q0.d0; q32_1 = (uint32)q1.d0; @@ -3022,7 +3022,7 @@ if(~pshift != p+78) { double iq0,iq1,iq2, iqinv0,iqinv1,iqinv2, ix0,ix1,ix2, ilo0,ilo1,ilo2, ihi0,ihi1,ihi2; uint32 FERMAT = isPow2_64(p)<<1; // *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q4 : p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -3040,10 +3040,10 @@ if(~pshift != p+78) { q1.d0 += 1; q2.d0 += 1; q3.d0 += 1; - ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0"); - ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0"); - ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0"); - ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0"); + ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q4 : (q0.d1 >> 14) != 0"); + ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q4 : (q1.d1 >> 14) != 0"); + ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q4 : (q2.d1 >> 14) != 0"); + ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q4 : (q3.d1 >> 14) != 0"); /* 
Convert q to floating form: */ CVT_UINT78_3WORD_DOUBLE(q0, fq0,fq1,fq2); @@ -3319,7 +3319,7 @@ if(~pshift != p+78) { // No TF support on ARMv8 uint64 twopmodq78_3WORD_DOUBLE_q8(uint64 p, uint64 k[], int init_sse2, int thr_id) { - ASSERT(HERE,0,"No TF support on ARMv8!"); + ASSERT(0,"No TF support on ARMv8!"); } #elif defined(X64_ASM) && defined(USE_SSE2) @@ -3409,18 +3409,18 @@ if(~pshift != p+78) { #endif fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. 
unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; } // Alloc the local-memory block - SSE2 needs 6 fewer double-slots than AVX (since only need one copy each of two13i,two26f,two26i), but use same AVX-alloc for both: - sc_arr = ALLOC_DOUBLE(sc_arr, 0x6c*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0x6c*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr); // Force vec_dbl-alignment - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; #ifdef USE_AVX @@ -3487,7 +3487,7 @@ if(~pshift != p+78) { /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0x6c; /* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 4 to span an AVX register: */ aq0 = sc_ptr + 0x00; bq0 = sc_ptr + 0x01; cq0 = sc_ptr + 0x02; dq0 = sc_ptr + 0x03; eq0 = sc_ptr + 0x04; fq0 = sc_ptr + 0x05; gq0 = sc_ptr + 0x06; hq0 = sc_ptr + 0x07; @@ -3512,7 +3512,7 @@ if(~pshift != p+78) { #endif #endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q8 : p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q8 : p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE #error MUL_LOHI64_SUBROUTINE defined! 
@@ -3534,14 +3534,14 @@ if(~pshift != p+78) { q5.d0 += 1; q6.d0 += 1; q7.d0 += 1; - ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0"); - ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0"); - ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0"); - ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0"); - ASSERT(HERE, (q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0"); - ASSERT(HERE, (q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0"); - ASSERT(HERE, (q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0"); - ASSERT(HERE, (q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0"); + ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0"); + ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0"); + ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0"); + ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0"); + ASSERT((q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0"); + ASSERT((q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0"); + ASSERT((q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0"); + ASSERT((q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0"); q32_0 = (uint32)q0.d0; q32_1 = (uint32)q1.d0; @@ -3995,9 +3995,9 @@ if(~pshift != p+78) { zshift0 <<= 1; zshift1 <<= 1; /* In [0,76]/[36,112]; Doubling the shift count here takes cares of the first SQR_LOHI */ pshift = ~pshift; /* 40 16-byte slots for floats, 16 for ints: */ - sc_arr = ALLOC_VEC_DBL(sc_arr, 40+32); ASSERT(HERE, sc_arr != 0x0, "ERROR: unable to allocate sc_arr!"); + sc_arr = ALLOC_VEC_DBL(sc_arr, 40+32); ASSERT(sc_arr != 0x0, "ERROR: unable to allocate sc_arr!"); sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Remember, these are POINTERS-TO-DOUBLES, so need an increment of 2 to span an SSE register. 
The bytewise address offsets of the left-column pointers (relative to base address fq0) are in the right comment-column: */ /* Byte offset */ @@ -4036,7 +4036,7 @@ if(~pshift != p+78) { /* Need both float and integer data to share same allocated chunk of memory, so can use a single base/offset scheme to manage both */ sm_ptr = (uint64*)(sc_ptr + 0x50); /* Contiguous offset w.r.to last float data above is 0x4a, but start ints at +0x50 for ease: */ - ASSERT(HERE, (uint32)sm_ptr == ((uint32)sc_ptr + 0x280), "sm_ptr not offset as expected!"); + ASSERT((uint32)sm_ptr == ((uint32)sc_ptr + 0x280), "sm_ptr not offset as expected!"); /* Remember, these are pointers-to-uint128, so need an increment of 2 to span a memory slot: */ /* Byte offsets: */ qptr4 = (uint96*)(sm_ptr + 0x00); qptr5 = (uint96*)(sm_ptr + 0x02); qptr6 = (uint96*)(sm_ptr + 0x04); qptr7 = (uint96*)(sm_ptr + 0x06); /* 0x280 */ qinv4 = (uint96*)(sm_ptr + 0x08); qinv5 = (uint96*)(sm_ptr + 0x0a); qinv6 = (uint96*)(sm_ptr + 0x0c); qinv7 = (uint96*)(sm_ptr + 0x0e); /* 0x2c0 */ @@ -4048,7 +4048,7 @@ if(~pshift != p+78) { ptr64 = (uint64*)ONE96_PTR; *ptr64++ = ONE96.d0; *ptr64-- = ONE96.d1; } /* first_entry */ - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p; MUL_LOHI64(q0.d0, k[0], q0.d0, q0.d1); MUL_LOHI64(q1.d0, k[1], q1.d0, q1.d1); @@ -4067,14 +4067,14 @@ if(~pshift != p+78) { q5.d0 += 1; q6.d0 += 1; q7.d0 += 1; - ASSERT(HERE, (q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0"); - ASSERT(HERE, (q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0"); - ASSERT(HERE, (q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0"); - ASSERT(HERE, (q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0"); - ASSERT(HERE, (q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0"); - ASSERT(HERE, (q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0"); - ASSERT(HERE, (q6.d1 >> 14) == 0, 
"twopmodq78_q8 : (q6.d1 >> 14) != 0"); - ASSERT(HERE, (q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0"); + ASSERT((q0.d1 >> 14) == 0, "twopmodq78_q8 : (q0.d1 >> 14) != 0"); + ASSERT((q1.d1 >> 14) == 0, "twopmodq78_q8 : (q1.d1 >> 14) != 0"); + ASSERT((q2.d1 >> 14) == 0, "twopmodq78_q8 : (q2.d1 >> 14) != 0"); + ASSERT((q3.d1 >> 14) == 0, "twopmodq78_q8 : (q3.d1 >> 14) != 0"); + ASSERT((q4.d1 >> 14) == 0, "twopmodq78_q8 : (q4.d1 >> 14) != 0"); + ASSERT((q5.d1 >> 14) == 0, "twopmodq78_q8 : (q5.d1 >> 14) != 0"); + ASSERT((q6.d1 >> 14) == 0, "twopmodq78_q8 : (q6.d1 >> 14) != 0"); + ASSERT((q7.d1 >> 14) == 0, "twopmodq78_q8 : (q7.d1 >> 14) != 0"); /*****************************************************************************************************/ /*** From here onward, q0-3 get processed via 78-bit float-based modmul, q4-7 via 96-bit pure-int: ***/ @@ -4866,7 +4866,7 @@ if(~pshift != p+78) { *two13i, *two26f,*two26i, *two52f,*two52i; #if FAC_DEBUG - if(dbg) printf("%s with p = %llu, k[] = %llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", + if(dbg) printf("%s with p = %" PRIu64 ", k[] = %" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n", func,p,k[0x0],k[0x1],k[0x2],k[0x3],k[0x4],k[0x5],k[0x6],k[0x7],k[0x8],k[0x9],k[0xa],k[0xb],k[0xc],k[0xd],k[0xe],k[0xf]); #endif if(p != psave) @@ -4890,7 +4890,7 @@ if(~pshift != p+78) { // Result in [0,76], i.e. qinv << (zshift<<1) always has at least the leading bit set. 
pshift = ~pshift; #if FAC_DEBUG - if(dbg) printf("pshift = 0x%llX\n",pshift); + if(dbg) printf("pshift = %#" PRIX64 "\n",pshift); #endif } @@ -4909,18 +4909,18 @@ if(~pshift != p+78) { #endif fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; } // Alloc the local-memory block: - sc_arr = ALLOC_DOUBLE(sc_arr, 0xfc*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0xfc*max_threads + 4); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr); // Force vec_dbl-alignment - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; two13i = sc_ptr + 0xc0; @@ -4988,7 +4988,7 @@ if(~pshift != p+78) { /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0xfc; /* Remember, these are POINTERS-TO-DOUBLES, so need an 
increment of 4 to span an AVX register: */ fq0 [0] = sc_ptr + 0x00; @@ -5029,7 +5029,7 @@ if(~pshift != p+78) { #ifdef MUL_LOHI64_SUBROUTINE #error MUL_LOHI64_SUBROUTINE defined! #endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q16: p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q16: p must be < 2^63!"); for(j = 0; j < 16; j++) { q[j].d0 = p+p; @@ -5192,7 +5192,7 @@ if(~pshift != p+78) { : "cc","memory","rax","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6", "xmm8","xmm9","xmm10", "xmm12","xmm13","xmm14" /* Clobbered registers */\ ); } else { - ASSERT(HERE,0,"zshift out of range!"); + ASSERT(0,"zshift out of range!"); } // VEC_DBL_INIT_4((vec_dbl*)fx0[0], dtmp); This wound up using zmm0 in some of my builds, so moved this init inside the iter1 macro below @@ -5237,7 +5237,7 @@ if(~pshift != p+78) { #if FAC_DEBUG if(dbg) { - printf("p = %llu, k0 = %llu, start_index0 = %u, initial shift = %u\n",p,k[0],start_index,zshift); + printf("p = %" PRIu64 ", k0 = %" PRIu64 ", start_index0 = %u, initial shift = %u\n",p,k[0],start_index,zshift); printf("On modpow-loop entry: start_index = %u,\n\tfx0-2 = %20.15f, %20.15f, %20.15f, %20.15f\n",start_index, *fx0[0],*fx1[0],*fx2[0]); } #endif @@ -5359,7 +5359,7 @@ if(~pshift != p+78) { #if FAC_DEBUG if(dbg) { - printf("xout_q16 = %llX\n",r); + printf("xout_q16 = %" PRIX64 "\n",r); exit(0); } #endif @@ -5386,7 +5386,7 @@ if(~pshift != p+78) { static uint64 psave = 0, pshift; static uint32 start_index, zshift, first_entry = TRUE; uint32 FERMAT = isPow2_64(p)<<1; // *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case - uint8* minv8_ptr = minv8; // Ptr to Table of precomputed byte-inverses def'd in mi64.h + const uint8* minv8_ptr = minv8; // Ptr to Table of precomputed byte-inverses def'd in mi64.h static int max_threads = 1; // Default local-array-init is for just a single thread ... caller can re-init for > 1 threads later, if desired. 
#ifdef USE_AVX512_I #error AVX-512 IFMA instruction extensions not yet supported! @@ -5402,7 +5402,7 @@ if(~pshift != p+78) { uint64 *fq0[32],*fq1[32],*fq2[32],*fqhi52[32], *fqinv0[32],*fqinv1[32],*fqinv2[32], *fx0[32],*fx1[32],*fx2[32], *mask_lo26,*mask_lo52; for(j = 0; j < 32; j++) { - ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!"); + ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!"); } #else static double *sc_arr = 0x0, *sc_ptr; @@ -5418,7 +5418,7 @@ if(~pshift != p+78) { kdbl[32]; // AVX-512 Foundation lacks the needed DQ extensions, so use HLL to convert kvec entries to double: for(j = 0; j < 32; j++) { - ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!"); + ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!"); kdbl[j] = (double)k[j]; } #endif @@ -5458,10 +5458,10 @@ if(~pshift != p+78) { #endif fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; @@ -5469,9 +5469,9 @@ if(~pshift != p+78) { // Alloc the local-memory block the #bytes multiplier has plenty of extra room built in, e.g. 
for debug-data-writes: #ifdef USE_AVX512_I - sc_arr = ALLOC_UINT64(sc_arr, 0x1c0*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_UINT64(sc_arr, 0x1c0*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (uint64 *)ALIGN_VEC_U64(sc_arr); // Force vec_u64-alignment - ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; mask_lo26 = sc_ptr + 0x180; @@ -5522,9 +5522,9 @@ if(~pshift != p+78) { #else // Default AVX-512 floating-point-FMA mode /***************************************************/ - sc_arr = ALLOC_DOUBLE(sc_arr, 0x1c0*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0x1c0*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (uint64 *)ALIGN_VEC_DBL(sc_arr); // Force vec_u64-alignment - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; two13i = sc_ptr + 0x180; @@ -5593,7 +5593,7 @@ if(~pshift != p+78) { /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0x1c0; #ifdef USE_AVX512_I @@ -5669,7 +5669,7 @@ if(~pshift != p+78) { #ifdef MUL_LOHI64_SUBROUTINE #error MUL_LOHI64_SUBROUTINE defined! 
#endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q32: p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q32: p must be < 2^63!"); #ifdef USE_AVX512_I @@ -5987,7 +5987,7 @@ if(~pshift != p+78) { } else if(zshift < 78) { dtmp = 1<<(zshift-52); for(j = 0; j < 32; j += 8) { VEC_DBL_INIT_8((vec_dbl*)fx0[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx1[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx2[j],dtmp); } } else { - ASSERT(HERE,0,"zshift out of range!"); + ASSERT(0,"zshift out of range!"); } /*...x^2 mod q is returned in x. */ @@ -6133,7 +6133,7 @@ if(~pshift != p+78) { static uint64 psave = 0, pshift; static uint32 start_index, zshift, first_entry = TRUE; uint32 FERMAT = isPow2_64(p)<<1; // *2 is b/c need to add 2 to the usual Mers-mod residue in the Fermat case - uint8* minv8_ptr = minv8; // Ptr to Table of precomputed byte-inverses def'd in mi64.h + const uint8* minv8_ptr = minv8; // Ptr to Table of precomputed byte-inverses def'd in mi64.h static int max_threads = 1; // Default local-array-init is for just a single thread ... caller can re-init for > 1 threads later, if desired. #ifdef USE_AVX512_I #error AVX-512 IFMA instruction extensions not yet supported! 
@@ -6149,7 +6149,7 @@ if(~pshift != p+78) { uint64 *fq0[64],*fq1[64],*fq2[64],*fqhi52[64], *fqinv0[64],*fqinv1[64],*fqinv2[64], *fx0[64],*fx1[64],*fx2[64], *mask_lo26,*mask_lo52; for(j = 0; j < 64; j++) { - ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!"); + ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!"); } #else static double *sc_arr = 0x0, *sc_ptr; @@ -6165,7 +6165,7 @@ if(~pshift != p+78) { kdbl[64]; // AVX-512 Foundation lacks the needed DQ extensions, so use HLL to convert kvec entries to double: for(j = 0; j < 64; j++) { - ASSERT(HERE, (k[j] >> 52) == 0ull, "Ks must be < 2^52!"); + ASSERT((k[j] >> 52) == 0ull, "Ks must be < 2^52!"); kdbl[j] = (double)k[j]; } #endif @@ -6205,10 +6205,10 @@ if(~pshift != p+78) { #endif fprintf(stderr, "%s: Setting up for as many as %d threads...\n",func,max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sc_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. unthreaded call to the function) free((void *)sc_arr); sc_arr=0x0; @@ -6216,9 +6216,9 @@ if(~pshift != p+78) { // Alloc the local-memory block the #bytes multiplier has plenty of extra room built in, e.g. 
for debug-data-writes: #ifdef USE_AVX512_I - sc_arr = ALLOC_UINT64(sc_arr, 0x380*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_UINT64(sc_arr, 0x380*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (uint64 *)ALIGN_VEC_U64(sc_arr); // Force vec_u64-alignment - ASSERT(HERE, ((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uint32)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; mask_lo26 = sc_ptr + 0x300; @@ -6269,9 +6269,9 @@ if(~pshift != p+78) { #else // Default AVX-512 floating-point-FMA mode /***************************************************/ - sc_arr = ALLOC_DOUBLE(sc_arr, 0x380*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_DOUBLE(sc_arr, 0x380*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (uint64 *)ALIGN_VEC_DBL(sc_arr); // Force vec_u64-alignment - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); #ifdef MULTITHREAD __r0 = sc_ptr; two13i = sc_ptr + 0x300; @@ -6340,7 +6340,7 @@ if(~pshift != p+78) { /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); sc_ptr = __r0 + thr_id*0x380; #ifdef USE_AVX512_I @@ -6416,7 +6416,7 @@ if(~pshift != p+78) { #ifdef MUL_LOHI64_SUBROUTINE #error MUL_LOHI64_SUBROUTINE defined! 
#endif - ASSERT(HERE, (p >> 63) == 0, "twopmodq78_q64: p must be < 2^63!"); + ASSERT((p >> 63) == 0, "twopmodq78_q64: p must be < 2^63!"); #ifdef USE_AVX512_I @@ -6777,7 +6777,7 @@ if(~pshift != p+78) { } else if(zshift < 78) { dtmp = 1<<(zshift-52); for(j = 0; j < 64; j += 8) { VEC_DBL_INIT_8((vec_dbl*)fx0[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx1[j], 0.0); VEC_DBL_INIT_8((vec_dbl*)fx2[j],dtmp); } } else { - ASSERT(HERE,0,"zshift out of range!"); + ASSERT(0,"zshift out of range!"); } /*...x^2 mod q is returned in x. */ diff --git a/src/twopmodq80.h b/src/twopmodq80.h index 38f83019..f14b5de1 100755 --- a/src/twopmodq80.h +++ b/src/twopmodq80.h @@ -1866,7 +1866,7 @@ to balanced-digit floating-point form. Outputs have the following size ranges: uint64 __tmp64;\ int64 __itmp, __cy;\ \ - DBG_ASSERT(HERE, (__x.d1 >> 14) == 0, "Input > 78-bit limit!");\ + DBG_ASSERT((__x.d1 >> 14) == 0, "Input > 78-bit limit!");\ \ /* Digit 0: */\ __tmp64 = __x.d0;\ @@ -1892,7 +1892,7 @@ to balanced-digit floating-point form. Outputs have the following size ranges: /* No balanced-digit normalization of MSW: */\ __fword2 = (double)__tmp64;\ \ - DBG_ASSERT(HERE, __fword2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ + DBG_ASSERT(__fword2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ } /* Converts a 78-bit unsigned input __x (stored in balanced-digit @@ -1907,7 +1907,7 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized. if(__itmp < 0) /* If current digit < 0, add the base and set carry = -1 */\ {\ __itmp += TWO26FLOAT;\ - DBG_ASSERT(HERE, __itmp >= 0, "Normalized digit still < 0!");\ + DBG_ASSERT(__itmp >= 0, "Normalized digit still < 0!");\ __cy = -1;\ }\ else\ @@ -1921,7 +1921,7 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized. 
if(__itmp < 0)\ {\ __itmp += TWO26FLOAT;\ - DBG_ASSERT(HERE, __itmp >= 0, "Normalized digit still < 0!");\ + DBG_ASSERT(__itmp >= 0, "Normalized digit still < 0!");\ __cy = -1;\ }\ else\ @@ -1935,7 +1935,7 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized. if(__itmp < 0)\ {\ __itmp += TWO26FLOAT;\ - DBG_ASSERT(HERE, __itmp >= 0, "Normalized digit still < 0!");\ + DBG_ASSERT(__itmp >= 0, "Normalized digit still < 0!");\ __cy = -1;\ }\ else\ @@ -1945,8 +1945,8 @@ floating-point form) to a uint96. Assumes the FP input is properly normalized. __x.d0 += ((uint64)__itmp << 52);\ __x.d1 = ((uint64)__itmp >> 12) & 0x0000000000003fff; /* Only case where we really need the (uint64) cast */\ \ - DBG_ASSERT(HERE, (__x.d1 >> 14) == 0, "Output > 78-bit limit!");\ - DBG_ASSERT(HERE, __cy == 0, "Nonzero exit carry!");\ + DBG_ASSERT((__x.d1 >> 14) == 0, "Output > 78-bit limit!");\ + DBG_ASSERT(__cy == 0, "Nonzero exit carry!");\ } /* Takes a 78-bit unsigned input __x stored in balanced-digit floating-point form @@ -1968,8 +1968,8 @@ and renormalizes with respect to the balanced-digit base. /* Digit 2: */\ __x2 += __fcy;\ \ - DBG_ASSERT(HERE, __x2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ - DBG_ASSERT(HERE, __x2 >= 0 , "MSW < 0!");\ + DBG_ASSERT(__x2 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ + DBG_ASSERT(__x2 >= 0 , "MSW < 0!");\ } /* Takes a 156-bit unsigned input __x stored in balanced-digit floating-point form @@ -2013,10 +2013,10 @@ separately, we require the MSW of each to be nonnegative, i.e. 
we don't balance /* Digit 5: */\ __x5 += __fcy;\ \ - DBG_ASSERT(HERE, __x2 >= 0 , "_x2 < 0!");\ - DBG_ASSERT(HERE, __x2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __x5 >= 0 , "MSW < 0!");\ - DBG_ASSERT(HERE, __x5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ + DBG_ASSERT(__x2 >= 0 , "_x2 < 0!");\ + DBG_ASSERT(__x2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\ + DBG_ASSERT(__x5 >= 0 , "MSW < 0!");\ + DBG_ASSERT(__x5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ } /**********************************************************************************/ @@ -2069,9 +2069,9 @@ separately, we require the MSW of each to be nonnegative, i.e. we don't balance double __fcy;\ uint32 __itmp;\ \ - DBG_ASSERT(HERE, __fx0 < TWO26FLOAT, "x0 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __fx1 < TWO26FLOAT, "x1 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __fx2 < TWO26FLOAT, "x2 > TWO26FLOAT");\ + DBG_ASSERT(__fx0 < TWO26FLOAT, "x0 > TWO26FLOAT");\ + DBG_ASSERT(__fx1 < TWO26FLOAT, "x1 > TWO26FLOAT");\ + DBG_ASSERT(__fx2 < TWO26FLOAT, "x2 > TWO26FLOAT");\ \ /* Digit 0: */\ __fprod0 = __fx0*__fx0;\ @@ -2105,10 +2105,10 @@ separately, we require the MSW of each to be nonnegative, i.e. 
we don't balance /* Digit 5: */\ __fprod5 = __fcy;\ \ - DBG_ASSERT(HERE, __fprod2 >= 0 , "_x2 < 0!");\ - DBG_ASSERT(HERE, __fprod2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __fprod5 >= 0 , "MSW < 0!");\ - DBG_ASSERT(HERE, __fprod5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ + DBG_ASSERT(__fprod2 >= 0 , "_x2 < 0!");\ + DBG_ASSERT(__fprod2 <= TWO26FLOAT, "_x2 > TWO26FLOAT");\ + DBG_ASSERT(__fprod5 >= 0 , "MSW < 0!");\ + DBG_ASSERT(__fprod5 <= TWO26FLOAT, "MSW > TWO26FLOAT");\ } #ifdef __CUDACC__ @@ -2483,7 +2483,7 @@ double __mo52 = fma(__y1,TWO26FLOAT,__y0); /* mo52 = y0 + y1*2^26 */\ __itmp = (__lo2 < 0);\ __lo2 += (double)(__itmp << 26);\ /* Require output to be nonnegative, so leave MSW unbalanced: */\ - DBG_ASSERT(HERE, __lo2 >= 0, "MSW < 0!");\ + DBG_ASSERT(__lo2 >= 0, "MSW < 0!");\ } #endif @@ -2564,13 +2564,13 @@ we code so that any or all of __X, __Y and __LO may have the same addresses. double __fcy, __tmp, __prod3, __prod4;\ uint32 __itmp;\ \ - DBG_ASSERT(HERE, __x0 < TWO26FLOAT, "x0 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __x1 < TWO26FLOAT, "x1 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __x2 < TWO26FLOAT, "x2 > TWO26FLOAT");\ + DBG_ASSERT(__x0 < TWO26FLOAT, "x0 > TWO26FLOAT");\ + DBG_ASSERT(__x1 < TWO26FLOAT, "x1 > TWO26FLOAT");\ + DBG_ASSERT(__x2 < TWO26FLOAT, "x2 > TWO26FLOAT");\ \ - DBG_ASSERT(HERE, __y0 < TWO26FLOAT, "y0 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __y1 < TWO26FLOAT, "y1 > TWO26FLOAT");\ - DBG_ASSERT(HERE, __y2 < TWO26FLOAT, "y2 > TWO26FLOAT");\ + DBG_ASSERT(__y0 < TWO26FLOAT, "y0 > TWO26FLOAT");\ + DBG_ASSERT(__y1 < TWO26FLOAT, "y1 > TWO26FLOAT");\ + DBG_ASSERT(__y2 < TWO26FLOAT, "y2 > TWO26FLOAT");\ \ /* Digit 0: */\ __tmp = __x0*__y0;\ @@ -2610,7 +2610,7 @@ we code so that any or all of __X, __Y and __LO may have the same addresses. 
/* Digit 5: */\ __hi2 = __fcy;\ \ - DBG_ASSERT(HERE, __hi2 >= 0, "MSW < 0!");\ + DBG_ASSERT(__hi2 >= 0, "MSW < 0!");\ } #ifdef __CUDACC__ @@ -2892,7 +2892,7 @@ out128.d0 = (uint64)__fcy; out128.d1 = (uint64)0ull;\ ADD128(j128,out128,out128);\ lo26 = out128.d0 & 0x0000000003FFFFFFull;\ RSHIFT128(out128, 26, out128);\ -fprintf(stderr,"exact<52:77>, <78:129> = %20llu, %20llu\n",out128.d0,out128.d1);\ +fprintf(stderr,"exact<52:77>, <78:129> = %20" PRIu64 ", %20" PRIu64 "\n",out128.d0,out128.d1);\ } */ #define MULH78_3WORD_DOUBLE_q4(\ @@ -2935,7 +2935,7 @@ out128.d0 = (uint64)__fcy; out128.d1 = (uint64)0ull;\ ADD128(j128,out128,out128);\ lo26 = out128.d0 & 0x0000000003FFFFFFull;\ RSHIFT128(out128, 26, out128);\ -fprintf(stderr,"exact<52:77>, <78:129> = %20llu, %20llu\n",out128.d0,out128.d1);\ +fprintf(stderr,"exact<52:77>, <78:129> = %20" PRIu64 ", %20" PRIu64 "\n",out128.d0,out128.d1);\ __ftmp = __fx52*__fy2+__fx2*__fy52+__fcy;\ __gtmp = __gx52*__gy2+__gx2*__gy52+__gcy;\ __htmp = __hx52*__hy2+__hx2*__hy52+__hcy;\ diff --git a/src/twopmodq96.c b/src/twopmodq96.c index 7c1a62d4..19bb15b8 100755 --- a/src/twopmodq96.c +++ b/src/twopmodq96.c @@ -118,7 +118,7 @@ /*...Double and return. These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. */ -//(p == 18276023 && k == 760542841672ull)printf("q = [%u,%llu], x = [%u,%llu]\n",q.d1,q.d0, x.d1,x.d0); +//(p == 18276023 && k == 760542841672ull)printf("q = [%u,%" PRIu64 "], x = [%u,%" PRIu64 "]\n",q.d1,q.d0, x.d1,x.d0); #if 1 // I should read my own comments ... since x = (q+1)/2 implies divisibility can replace this... ADD96(x,x,x); /* In the case of interest, x = (q+1)/2 < 2^95, so x + x cannot overflow. 
*/ q.d0 -= FERMAT; @@ -129,7 +129,7 @@ return (x.d1 == qhalf.d1 && x.d0 == (qhalf.d0+1)); #endif #else // ifndef __CUDA_ARCH__ - ASSERT(HERE, 0, "Device code being called in host mode!"); + ASSERT(0, "Device code being called in host mode!"); return 0; #endif } @@ -229,7 +229,7 @@ uint96 twopmodq96(uint64 p, uint64 k) #ifdef FAC_DEBUG if(dbg)printf("twopmodq96:\n"); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -246,7 +246,7 @@ if(dbg)printf("twopmodq96:\n"); */ /* q must be odd for Montgomery-style modmul to work: */ #ifdef FAC_DEBUG - ASSERT(HERE, (q.d0 & (uint64)1) == 1, "twopmodq96 : (q.d0 & (uint64)1) == 1"); + ASSERT((q.d0 & (uint64)1) == 1, "twopmodq96 : (q.d0 & (uint64)1) == 1"); #endif /* Init qinv = q. We're really only interested in the bottom 2 bits of q. */ qinv.d0 = (q.d0 + q.d0 + q.d0) ^ (uint64)2; qinv.d1 = (uint64)0; @@ -287,7 +287,7 @@ if(dbg)printf("twopmodq96:\n"); MULL96(q, qinv, x); MULL128(q, qinv, y); SUB96 (TWO96, x, x); SUB128 (TWO128, y, y); MULL96(qinv, x, x); MULL128(qinv, y, y); - ASSERT(HERE, x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0, "x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0"); + ASSERT(x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0, "x.d1 == (y.d1 & 0x00000000ffffffff) && x.d0 == y.d0"); #endif /* qinv has 96 bits, but only the upper 32 get modified here. 
*/ #ifdef MUL_LOHI64_SUBROUTINE @@ -299,7 +299,7 @@ if(dbg)printf("twopmodq96:\n"); qinv.d1 &= 0x00000000ffffffff; /* Only want the lower 32 bits here */ #ifdef FAC_DEBUG - ASSERT(HERE, qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq96 : qinv.d1 == x.d1 && qinv.d0 == x.d0"); + ASSERT(qinv.d1 == x.d1 && qinv.d0 == x.d0, "twopmodq96 : qinv.d1 == x.d1 && qinv.d0 == x.d0"); if(dbg) printf("q = %s\n", &char_buf[convert_uint96_base10_char(char_buf, q )]); if(dbg) printf("qinv = %s\n", &char_buf[convert_uint96_base10_char(char_buf, qinv)]); #endif @@ -333,7 +333,7 @@ if(dbg)printf("twopmodq96:\n"); if((pshift >> j) & (uint64)1) { #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)"); + ASSERT(CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT96(x, qhalf)){ ADD96(x, x, x); SUB96(x, q, x); }else{ ADD96(x, x, x); } @@ -369,15 +369,15 @@ if(dbg)printf("twopmodq96:\n"); SUB96(q, lo, lo); ADD96(lo, hi, x); #ifdef FAC_DEBUG - if(dbg) printf("q-l = %10u, %20llu\n", lo.d1, lo.d0); - if(dbg) printf("q-l+h = %10u, %20llu\n", x.d1, x.d0); + if(dbg) printf("q-l = %10u, %20" PRIu64 "\n", lo.d1, lo.d0); + if(dbg) printf("q-l+h = %10u, %20" PRIu64 "\n", x.d1, x.d0); #endif } else { SUB96(hi, lo, x); #ifdef FAC_DEBUG - if(dbg) printf("q=h-l = %10u, %20llu\n", x.d1, x.d0); + if(dbg) printf("q=h-l = %10u, %20" PRIu64 "\n", x.d1, x.d0); #endif } @@ -388,7 +388,7 @@ if(dbg)printf("j = %2d, x = %s",j, &char_buf[convert_uint96_base10_char(char_buf if((pshift >> j) & (uint64)1) { #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)"); + ASSERT(CMPULT96(x, q), "twopmodq96 : CMPULT96(x,q)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT96(x, qhalf)){ ADD96(x, x, x); SUB96(x, q, x); }else{ ADD96(x, x, x); } @@ -479,24 +479,24 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) #endif fprintf(stderr, 
"twopmodq96_q4: Setting up for as many as %d threads...\n",max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sm_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. unthreaded call to the function) free((void *)sm_arr); sm_arr=0x0; } // Alloc the local-memory block: - sm_arr = ALLOC_UINT64(sm_arr, 0x32*max_threads); ASSERT(HERE, sm_arr != 0x0, "ERROR: unable to allocate sm_arr!"); - sm_ptr = (uint64*)ALIGN_UINT64(sm_arr); ASSERT(HERE, ((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!"); + sm_arr = ALLOC_UINT64(sm_arr, 0x32*max_threads); ASSERT(sm_arr != 0x0, "ERROR: unable to allocate sm_arr!"); + sm_ptr = (uint64*)ALIGN_UINT64(sm_arr); ASSERT(((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!"); #ifdef MULTITHREAD __r0 = (uint96 *)sm_ptr; ptr64 = sm_ptr + 0x30; // *** PTR-OFFSET IN TERMS OF UINT64 HERE *** for(j = 0; j < max_threads; ++j) { // These data fixed within each thread's local store: *ptr64++ = ONE96.d0; *ptr64-- = ONE96.d1; - // printf("INIT: Thr %d ONE96_PTR address = %llX; data.d0,d1 = %llu,%u\n",thr_id,(uint64)ptr64,((uint96 *)ptr64)->d0,((uint96 *)ptr64)->d1); + // printf("INIT: Thr %d ONE96_PTR address = %" PRIX64 "; data.d0,d1 = %" PRIu64 ",%u\n",thr_id,(uint64)ptr64,((uint96 *)ptr64)->d0,((uint96 *)ptr64)->d1); ptr64 += 0x32; // Move on to next thread's local store } #else @@ -515,7 +515,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) /* If multithreaded, set the 
local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); ptr64 = ((uint64*)__r0) + thr_id*0x32; qptr0 = (uint96*)(ptr64 + 0x00); qptr1 = (uint96*)(ptr64 + 0x02); qptr2 = (uint96*)(ptr64 + 0x04); qptr3 = (uint96*)(ptr64 + 0x06); qinv0 = (uint96*)(ptr64 + 0x08); qinv1 = (uint96*)(ptr64 + 0x0a); qinv2 = (uint96*)(ptr64 + 0x0c); qinv3 = (uint96*)(ptr64 + 0x0e); @@ -524,8 +524,8 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) qhalf0 = (uint96*)(ptr64 + 0x20); qhalf1 = (uint96*)(ptr64 + 0x22); qhalf2 = (uint96*)(ptr64 + 0x24); qhalf3 = (uint96*)(ptr64 + 0x26); hi0 = (uint96*)(ptr64 + 0x28); hi1 = (uint96*)(ptr64 + 0x2a); hi2 = (uint96*)(ptr64 + 0x2c); hi3 = (uint96*)(ptr64 + 0x2e); ONE96_PTR = (uint96*)(ptr64 + 0x30); - // printf("Thr %d ONE96_PTR address = %llX; data.d0,d1 = %llu,%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1); - ASSERT(HERE,(ONE96_PTR->d0 == ONE96.d0) && (ONE96_PTR->d1 == ONE96.d1), "Bad data at ONE96_PTR address!"); + // printf("Thr %d ONE96_PTR address = %" PRIX64 "; data.d0,d1 = %" PRIu64 ",%u\n",thr_id,(uint64)ONE96_PTR,ONE96_PTR->d0,ONE96_PTR->d1); + ASSERT((ONE96_PTR->d0 == ONE96.d0) && (ONE96_PTR->d1 == ONE96.d1), "Bad data at ONE96_PTR address!"); #endif pshift = p + 96; @@ -544,7 +544,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) if(dbg) printf("twopmodq96_q4: leadb = %u\n",leadb); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p; MUL_LOHI64(q0.d0, k0, q0.d0, q0.d1); MUL_LOHI64(q1.d0, k1, q1.d0, q1.d1); @@ -570,7 +570,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) ! Find modular inverse (mod 2^96) of q in preparation for modular multiply. 
*/ /* q must be odd for Montgomery-style modmul to work: */ - ASSERT(HERE, (q0.d0 & 1) && (q1.d0 & 1) && (q2.d0 & 1) && (q3.d0 & 1), "even modulus!"); + ASSERT((q0.d0 & 1) && (q1.d0 & 1) && (q2.d0 & 1) && (q3.d0 & 1), "even modulus!"); qinv0->d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2; qinv0->d1 = (uint64)0; qinv1->d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2; qinv1->d1 = (uint64)0; qinv2->d0 = (q2.d0 + q2.d0 + q2.d0) ^ (uint64)2; qinv2->d1 = (uint64)0; @@ -666,7 +666,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) if(CMPUGT96_PTR(x2, qhalf2)){ ADD96_PTR(x2, x2, x2); SUB96_PTR(x2, qptr2, x2); }else{ ADD96_PTR(x2, x2, x2); } if(CMPUGT96_PTR(x3, qhalf3)){ ADD96_PTR(x3, x3, x3); SUB96_PTR(x3, qptr3, x3); }else{ ADD96_PTR(x3, x3, x3); } #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96_PTR(x0, qptr0), "twopmodq96_q4 : CMPULT96(x0,q0)"); + ASSERT(CMPULT96_PTR(x0, qptr0), "twopmodq96_q4 : CMPULT96(x0,q0)"); #endif } @@ -1107,10 +1107,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) zshift <<= 1; /* Doubling the shift count here takes cares of the first SQR_LOHI */ pshift = ~pshift; #ifdef FAC_DEBUG - if(dbg) printf("twopmodq96_q4: leadb = %u, pshift = %llu\n",leadb,pshift); + if(dbg) printf("twopmodq96_q4: leadb = %u, pshift = %" PRIu64 "\n",leadb,pshift); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -1155,7 +1155,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) */ /* q must be odd for Montgomery-style modmul to work: */ #ifdef FAC_DEBUG - ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq96_q4 : (q0.d0 & (uint64)1) == 1"); + ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq96_q4 : (q0.d0 & (uint64)1) == 1"); #endif 
qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2; qinv0.d1 = (uint64)0; qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2; qinv1.d1 = (uint64)0; @@ -1274,10 +1274,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) if(CMPUGT96(x3, qhalf3)){ ADD96(x3, x3, x3); SUB96(x3, q3, x3); }else{ ADD96(x3, x3, x3); } #ifdef FAC_DEBUG // if(CMPULT96(q0, x0)) { sprintf(char_buf, "twopmodq96_q4 : (x0 = %s) >= (q0 = %s)", &str0[convert_uint96_base10_char(str0, x0)], &str1[convert_uint96_base10_char(str1, q0)] ); DBG_WARN(HERE, char_buf, STATFILE, !restart); } - ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)"); - ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)"); - ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)"); - ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)"); + ASSERT(CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)"); + ASSERT(CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)"); + ASSERT(CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)"); + ASSERT(CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)"); if(dbg) { printf("x0 = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x0)]); printf("x1 = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x1)]); @@ -1396,10 +1396,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) if((pshift >> j) & (uint64)1) { #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)"); - ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)"); - ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)"); - ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)"); + ASSERT(CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)"); + ASSERT(CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)"); + ASSERT(CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)"); + ASSERT(CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)"); #endif /* Combines overflow-on-add and 
need-to-subtract-q-from-sum checks */ if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); } @@ -1458,10 +1458,10 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) if((pshift >> j) & (uint64)1) { #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)"); - ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)"); - ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)"); - ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)"); + ASSERT(CMPULT96(x0, q0), "twopmodq96_q4 : CMPULT96(x0,q0)"); + ASSERT(CMPULT96(x1, q1), "twopmodq96_q4 : CMPULT96(x1,q1)"); + ASSERT(CMPULT96(x2, q2), "twopmodq96_q4 : CMPULT96(x2,q2)"); + ASSERT(CMPULT96(x3, q3), "twopmodq96_q4 : CMPULT96(x3,q3)"); #endif if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); } if(CMPUGT96(x1, qhalf1)){ ADD96(x1, x1, x1); SUB96(x1, q1, x1); }else{ ADD96(x1, x1, x1); } @@ -1582,24 +1582,24 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) #endif fprintf(stderr, "twopmodq96_q4: Setting up for as many as %d threads...\n",max_threads); #ifndef COMPILER_TYPE_GCC - ASSERT(HERE, NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); + ASSERT(NTHREADS == 1, "Multithreading currently only supported for GCC builds!"); #endif - ASSERT(HERE, max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); - ASSERT(HERE, thr_id == -1, "Init-mode call must be outside of any multithreading!"); + ASSERT(max_threads >= NTHREADS, "Multithreading requires max_threads >= NTHREADS!"); + ASSERT(thr_id == -1, "Init-mode call must be outside of any multithreading!"); } if(sm_arr != 0x0) { // Have previously-malloc'ed local storage (e.g. 
unthreaded call to the function) free((void *)sm_arr); sm_arr=0x0; } // Alloc the local-memory block - use uint64 allooc/align macros here, but underlying data are all uint96 = [uint64,uint32] pairs: - sm_arr = (uint96*)ALLOC_UINT64(sm_arr, 0x4a*max_threads); ASSERT(HERE, sm_arr != 0x0, "ERROR: unable to allocate sm_arr!"); - sm_ptr = (uint96*)ALIGN_UINT64(sm_arr); ASSERT(HERE, ((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!"); + sm_arr = (uint96*)ALLOC_UINT64(sm_arr, 0x4a*max_threads); ASSERT(sm_arr != 0x0, "ERROR: unable to allocate sm_arr!"); + sm_ptr = (uint96*)ALIGN_UINT64(sm_arr); ASSERT(((uint64)sm_ptr & 0xf) == 0, "sm_ptr not 16-byte aligned!"); #ifdef MULTITHREAD __r0 = (uint96 *)sm_ptr; ptr32 = (uint32*)(sm_ptr + 0x30); // perm_mask ptr to permute-index register containing dwords 0-7 = [0,7,1,7,2,7,3,7] for(j = 0; j < max_threads; ++j) { // These data fixed within each thread's local store: *ptr32 = 0; *(ptr32+1) = 7; *(ptr32+1) = 1; *(ptr32+1) = 7; *(ptr32+1) = 2; *(ptr32+1) = 7; *(ptr32+1) = 3; *(ptr32+1) = 7; - // printf("INIT: Thr %d perm_mask address = %llX; data.d0-7 = %llu,%u\n",thr_id,(uint64)ptr96,((uint96 *)ptr96)->d0,((uint96 *)ptr96)->d1); + // printf("INIT: Thr %d perm_mask address = %" PRIX64 "; data.d0-7 = %" PRIu64 ",%u\n",thr_id,(uint64)ptr96,((uint96 *)ptr96)->d0,((uint96 *)ptr96)->d1); ptr32 += 3 * 0x4a; // Move on to next thread's local store; 3x accounts for size differntial between uint32 and uint96 } #else @@ -1628,7 +1628,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) /* If multithreaded, set the local-store pointers needed for the current thread; */ #ifdef MULTITHREAD - ASSERT(HERE, (uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); + ASSERT((uint32)thr_id < (uint32)max_threads, "Bad thread ID!"); ptr96 = ((uint64*)__r0) + thr_id*0x4a; q0 = ptr96 + 0x00; q1 = ptr96 + 0x01; q2 = ptr96 + 0x02; q3 = ptr96 + 0x03; q4 = ptr96 + 0x04; q5 = ptr96 + 0x05; q6 = ptr96 + 0x06; q7 = ptr96 
+ 0x07; qinv0 = ptr96 + 0x08; qinv1 = ptr96 + 0x09; qinv2 = ptr96 + 0x0a; qinv3 = ptr96 + 0x0b; qinv4 = ptr96 + 0x0c; qinv5 = ptr96 + 0x0d; qinv6 = ptr96 + 0x0e; qinv7 = ptr96 + 0x0f; @@ -1637,7 +1637,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) lo0 = ptr96 + 0x28; lo1 = ptr96 + 0x29; lo2 = ptr96 + 0x2a; lo3 = ptr96 + 0x23; lo4 = ptr96 + 0x2c; lo5 = ptr96 + 0x2d; lo6 = ptr96 + 0x2e; lo7 = ptr96 + 0x2f; hi0 = ptr96 + 0x28; hi1 = ptr96 + 0x29; hi2 = ptr96 + 0x2a; hi3 = ptr96 + 0x2b; hi4 = ptr96 + 0x2c; hi5 = ptr96 + 0x2d; hi6 = ptr96 + 0x2e; hi7 = ptr96 + 0x2f; ptr32 = perm_mask = ptr96 + 0x30; // (0x30 * 3/2) + 2 gives 0x4a uint64 in above alloc - ASSERT(HERE,(*ptr32 == 0) && (*(ptr32+1) == 7) && (*(ptr32+1) == 1) && (*(ptr32+1) == 7) && (*(ptr32+1) == 2) && (*(ptr32+1) == 7) && (*(ptr32+1) == 3) && (*(ptr32+1) == 7), "Bad data at perm_mask address!"); + ASSERT((*ptr32 == 0) && (*(ptr32+1) == 7) && (*(ptr32+1) == 1) && (*(ptr32+1) == 7) && (*(ptr32+1) == 2) && (*(ptr32+1) == 7) && (*(ptr32+1) == 3) && (*(ptr32+1) == 7), "Bad data at perm_mask address!"); #endif pshift = p + 96; @@ -1656,7 +1656,7 @@ if(dbg)printf("xout = %s\n", &char_buf[convert_uint96_base10_char(char_buf, x)]) if(dbg) printf("twopmodq96_q8: leadb = %u\n",leadb); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0->d0 = q1->d0 = q2->d0 = q3->d0 = q4->d0 = q5->d0 = q6->d0 = q7->d0 = p+p; MUL_LOHI64(q0->d0, k0, q0->d0, q0->d1); MUL_LOHI64(q1->d0, k1, q1->d0, q1->d1); @@ -2107,7 +2107,7 @@ exit(0); if(dbg)printf("twopmodq96_q8:\n"); #endif - ASSERT(HERE, (p >> 63) == 0, "p must be < 2^63!"); + ASSERT((p >> 63) == 0, "p must be < 2^63!"); q0.d0 = q1.d0 = q2.d0 = q3.d0 = q4.d0 = q5.d0 = q6.d0 = q7.d0 = p+p; #ifdef MUL_LOHI64_SUBROUTINE // MUL_LOHI64 expects a 64-bit high-part pointer, in 32bit builds this buggers us if we try dumping hi-part directly into 32-bit q.d1 @@ -2153,14 +2153,14 @@ exit(0); */ /* 
q must be odd for Montgomery-style modmul to work: */ #ifdef FAC_DEBUG - ASSERT(HERE, (q0.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q0.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q1.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q1.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q2.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q2.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q3.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q3.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q4.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q4.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q5.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q5.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q6.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q6.d0 & (uint64)1) == 1"); - ASSERT(HERE, (q7.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q7.d0 & (uint64)1) == 1"); + ASSERT((q0.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q0.d0 & (uint64)1) == 1"); + ASSERT((q1.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q1.d0 & (uint64)1) == 1"); + ASSERT((q2.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q2.d0 & (uint64)1) == 1"); + ASSERT((q3.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q3.d0 & (uint64)1) == 1"); + ASSERT((q4.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q4.d0 & (uint64)1) == 1"); + ASSERT((q5.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q5.d0 & (uint64)1) == 1"); + ASSERT((q6.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q6.d0 & (uint64)1) == 1"); + ASSERT((q7.d0 & (uint64)1) == 1, "twopmodq96_q8 : (q7.d0 & (uint64)1) == 1"); #endif qinv0.d0 = (q0.d0 + q0.d0 + q0.d0) ^ (uint64)2; qinv0.d1 = (uint64)0; qinv1.d0 = (q1.d0 + q1.d0 + q1.d0) ^ (uint64)2; qinv1.d1 = (uint64)0; @@ -2276,14 +2276,14 @@ exit(0); if((pshift >> j) & (uint64)1) { #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)"); - ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)"); - ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)"); - ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)"); - ASSERT(HERE, CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)"); - ASSERT(HERE, 
CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)"); - ASSERT(HERE, CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)"); - ASSERT(HERE, CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)"); + ASSERT(CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)"); + ASSERT(CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)"); + ASSERT(CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)"); + ASSERT(CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)"); + ASSERT(CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)"); + ASSERT(CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)"); + ASSERT(CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)"); + ASSERT(CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); } @@ -2353,14 +2353,14 @@ exit(0); if((pshift >> j) & (uint64)1) { #ifdef FAC_DEBUG - ASSERT(HERE, CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)"); - ASSERT(HERE, CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)"); - ASSERT(HERE, CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)"); - ASSERT(HERE, CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)"); - ASSERT(HERE, CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)"); - ASSERT(HERE, CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)"); - ASSERT(HERE, CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)"); - ASSERT(HERE, CMPULT96(x7, q7), "twopmodq96_q8 : CMPULT96(x7,q7)"); + ASSERT(CMPULT96(x0, q0), "twopmodq96_q8 : CMPULT96(x0,q0)"); + ASSERT(CMPULT96(x1, q1), "twopmodq96_q8 : CMPULT96(x1,q1)"); + ASSERT(CMPULT96(x2, q2), "twopmodq96_q8 : CMPULT96(x2,q2)"); + ASSERT(CMPULT96(x3, q3), "twopmodq96_q8 : CMPULT96(x3,q3)"); + ASSERT(CMPULT96(x4, q4), "twopmodq96_q8 : CMPULT96(x4,q4)"); + ASSERT(CMPULT96(x5, q5), "twopmodq96_q8 : CMPULT96(x5,q5)"); + ASSERT(CMPULT96(x6, q6), "twopmodq96_q8 : CMPULT96(x6,q6)"); + ASSERT(CMPULT96(x7, q7), "twopmodq96_q8 : 
CMPULT96(x7,q7)"); #endif /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(CMPUGT96(x0, qhalf0)){ ADD96(x0, x0, x0); SUB96(x0, q0, x0); }else{ ADD96(x0, x0, x0); } diff --git a/src/types.h b/src/types.h index 47ebbba1..71ba40df 100755 --- a/src/types.h +++ b/src/types.h @@ -28,6 +28,9 @@ /* Include any needed level-0 header files: */ #include "platform.h" +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -38,10 +41,10 @@ extern "C" { /*...useful utility parameters */ #undef TRUE -#define TRUE 1 +#define TRUE true #undef FALSE -#define FALSE 0 +#define FALSE false /* Basic integer types - we assume char/short/int mean 8/16/32 bits, respectively, but this assumption gets checked at the start of program execution, @@ -63,55 +66,22 @@ so we're not flying blind: #undef sint64 #undef uint64 -#undef int64c -#undef sint64c -#undef uint64c +typedef int8_t int8; +typedef int8_t sint8; +typedef uint8_t uint8; -typedef char int8; -typedef char sint8; -typedef unsigned char uint8; +typedef int16_t int16; +typedef int16_t sint16; +typedef uint16_t uint16; -typedef short int16; -typedef short sint16; -typedef unsigned short uint16; +typedef int32_t int32; +typedef int32_t sint32; +typedef uint32_t uint32; -typedef int int32; -typedef int sint32; -typedef unsigned int uint32; +typedef int64_t int64; +typedef int64_t sint64; +typedef uint64_t uint64; -/* 64-bit int: */ -/* MSVC doesn't like 'long long', and of course MS has their own -completely non-portable substitute: -*/ -#if(defined(OS_TYPE_WINDOWS) && defined(COMPILER_TYPE_MSVC)) - typedef signed __int64 int64; - typedef signed __int64 sint64; - typedef unsigned __int64 uint64; - typedef const signed __int64 int64c; - typedef const signed __int64 sint64c; - typedef const unsigned __int64 uint64c; - - /* GW: In many cases where the C code is interfacing with the assembly code */ - /* we must declare variables that are exactly 32-bits wide. 
This is the */ - /* portable way to do this, as the linux x86-64 C compiler defines the */ - /* long data type as 64 bits. We also use portable definitions for */ - /* values that can be either an integer or a pointer. */ - #if OS_BITS == 64 - typedef int64 intptr_t; - typedef uint64 uintptr_t; - #else - typedef int32 intptr_t; - typedef uint32 uintptr_t; - #endif - -#else - typedef long long int64; - typedef long long sint64; - typedef unsigned long long uint64; - typedef const long long int64c; - typedef const long long sint64c; - typedef const unsigned long long uint64c; -#endif /* #ifdef int32_t #warning int32_t already defined! diff --git a/src/util.c b/src/util.c index 9e8fe146..dbf189b1 100644 --- a/src/util.c +++ b/src/util.c @@ -79,34 +79,19 @@ void WARN(long line, char*file, char*warn_string, char*warn_file, int copy2stder __device__ void ASSERT(long line, char*file, int expr, char*assert_string) {} #else - #ifdef USE_C99 - - void ASSERT(char*func, long line, char*file, int expr, char*assert_string) { - /* Define a convenient spot to set a breakpoint: */ - if(!expr) { - fprintf(stderr,"ERROR: Function %s, at line %lu of file %s\n", func, line, file); fprintf(stderr,"Assertion failed: %s\n", assert_string); - /* Flush all output streams prior to asserting. We replace the original assert(0) call with - an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. 
*/ - fflush(NULL); - exit(EXIT_FAILURE); - } - } - - #else - - void ASSERT(long line, char*file, int expr, char*assert_string) { + // void ASSERT(char*func, long line, char*file, int expr, char*assert_string) { + void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string) { /* Define a convenient spot to set a breakpoint: */ if(!expr) { - fprintf(stderr,"ERROR: at line %lu of file %s\n", line, file); fprintf(stderr,"Assertion failed: %s\n", assert_string); + fprintf(stderr,"ERROR: Function %s, at line %lu of file %s\n", func, line, file); fprintf(stderr,"Assertion '%s' failed: %s\n", assertion, assert_string); /* Flush all output streams prior to asserting. We replace the original assert(0) call with an exit(EXIT_FAILURE), since some compilers seem to like to optimize away assertions. */ fflush(NULL); - exit(EXIT_FAILURE); // Try to make this line coincide with a line # == 0 (mod 100) to ease breakpointing + // exit(EXIT_FAILURE); // Try to make this line coincide with a line # == 0 (mod 100) to ease breakpointing + abort(); } } - #endif - #endif // __CUDA_ARCH__ ? /***************/ @@ -136,7 +121,7 @@ void VAR_WARN(char *typelist, ...) 
dval = va_arg(varargs,double); break; default : - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); break; } } @@ -216,7 +201,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) y = (uint64 *)calloc(lenX + 1, sizeof(uint64)); // 10^100 has 333 bits, thus needs 6 uint64s, as do the mod-10^100 remainders, // but we allow the convert_base10_char_mi64() utility to do the allocation of the former for us: - lenD = 0; ASSERT(HERE, 0x0 != (d = convert_base10_char_mi64("1000000000000000000000000000", &lenD)) && (lenD == 2), "0"); + lenD = 0; ASSERT(0x0 != (d = convert_base10_char_mi64("1000000000000000000000000000", &lenD)) && (lenD == 2), "0"); r = (uint64 *)calloc(lenD, sizeof(uint64)); nc -= 28; // starting char of first 27-digit chunk for(i = 0; ; i+=2) { // i = #divides counter; do 2 divs per loop exec in attempt to get some modest pipelining @@ -240,7 +225,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) nc = nchars + (nchars/27) + 1; // Add newlines to count str[nc-1] = '\0'; fp = mlucas_fopen(fname, "w"); - ASSERT(HERE, fp != 0x0, "Null file pointer!"); + ASSERT(fp != 0x0, "Null file pointer!"); fprintf(fp,"%s\n", str); fclose(fp); fp = 0x0; fprintf(stderr,"Done writing %s.",fname); @@ -391,7 +376,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) for(i = 0; i < 8; ++i) { // It's a PRP: check vs table of known pseudoprimes and (if it's not a PSP) init for the next PSP: if((itmp32 >> i)&0x1) { - ASSERT(HERE, curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve"); + ASSERT(curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve"); if((curr_p + pdsum_8[i]) == fbase2psp[fbase2psp_idx]) { // It's a base-2 pseudoprime ++fbase2psp_idx; continue; @@ -549,7 +534,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) for(j = 0; j < 8; ++j) { if((itmp32 >> j)&0x1) // It's a PRP, so check against the table of known pseudoprimes and { // (if it's not a PSP) init for the next gap - ASSERT(HERE, curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve"); + 
ASSERT(curr_p <= fbase2psp[fbase2psp_idx],"Error in pseudoprime sieve"); if((curr_p + pdsum_8[j]) == fbase2psp[fbase2psp_idx]) { /* It's a base-2 pseudoprime */ ++fbase2psp_idx; pdiff[i] += pdiff_8[j]; @@ -574,8 +559,8 @@ void ui64_bitstr(const uint64 ui64, char*ostr) printf("Using first %u odd primes; max gap = %u\n",nprime,2*max_diff); printf("max sieving prime = %u\n",ihi); - ASSERT(HERE, p > thresh, "Mersenne prime exponent must be larger that allowable threshold!"); - ASSERT(HERE, twopmodq32(p-1, p) == 1, "p fails base-2 fprp test!"); + ASSERT(p > thresh, "Mersenne prime exponent must be larger that allowable threshold!"); + ASSERT(twopmodq32(p-1, p) == 1, "p fails base-2 fprp test!"); np = 0; // #primes in the current p-centered cohort // find N primes < and > p, compute smoothness norm based on p-1 factorization for each, store each [p,snorm] pair fbase2psp_idx = 0; // Index to next-expected Fermat base-2 pseudoprime in the precomputed table @@ -775,7 +760,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) // printf("I = %d: x = %f; y = %f; hi,lo = %f,%f\n",i, h_A[i],h_B[i],h_D[i],h_C[i]); if(cmp_fma_lohi_vs_exact(h_A[i],h_B[i],h_D[i],h_C[i], iax,iay,iahi,ialo)) { printf("ERROR: pow2 = %d, I = %d, outputs differ!\n",pow2,i); - ASSERT(HERE, 0, "fma_dmult tests failed!"); + ASSERT(0, "fma_dmult tests failed!"); } } // i-loop pow2_dmult *= 2; @@ -818,9 +803,9 @@ void ui64_bitstr(const uint64 ui64, char*ostr) } else { // Fill in any remaining slots with 63-bit test data. of which we know we have > (1<<10): p = fac63[i-nelt64].p; q = fac63[i-nelt64].q; - //if((i-nelt64) < 10)printf("p[%3d] = %u: q = %llu ... ",i, p, q); + //if((i-nelt64) < 10)printf("p[%3d] = %u: q = %" PRIu64 " ... 
",i, p, q); } - ASSERT(HERE, p != 0, "p must be nonzero!"); + ASSERT(p != 0, "p must be nonzero!"); // Compute auxiliary TF data: pshift = p + 64; jshift = leadz64(pshift); @@ -837,10 +822,10 @@ void ui64_bitstr(const uint64 ui64, char*ostr) dbl /= (2.0*p); rnd = DNINT(dbl); k = (uint64)rnd; - ASSERT(HERE, k*(p<<1)+1 == q, "k computed incorrectly!"); + ASSERT(k*(p<<1)+1 == q, "k computed incorrectly!"); *(h_p + i) = p ; *(h_pshft + i) = pshift ; *(h_k + i) = k; *(h_zshft + i) = zshift ; *(h_stidx + i) = start_index; - // printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",i, p, pshift, zshift, start_index, k); + // printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %" PRIu64 "\n",i, p, pshift, zshift, start_index, k); } printf("Testing %d = %d 64-bit and %d 63-bit known-factors...",N,nelt64,N-nelt64); @@ -849,7 +834,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) for(i = 0; i < N; ++i) { *(h_B+i) = 0; } - // printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); + // printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); // Allocate vectors in device memory uint64 *d_p,*d_pshft,*d_k; uint32 *d_zshft,*d_stidx; @@ -886,8 +871,8 @@ void ui64_bitstr(const uint64 ui64, char*ostr) j = (uint32)twopmodq64((uint64)p, q); if((j != 1) || (*(h_B + i) != 1)) { printf("cudaVecModpowTest64: Mismatch between Ref and GPU result:\n"); - printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,q) = %u, %llu\n", i,*(h_B + i), j,p,q); - ASSERT(HERE, 0, "cudaVecModpowTest64 failed!"); + printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,q) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,q); + ASSERT(0, "cudaVecModpowTest64 failed!"); } } printf("cudaVecModpowTest64 with %d test (p,q) pairs succeeded!\n",N); @@ -933,7 +918,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) for(i = 0; i < N; ++i) { *(h_B+i) = 
0; } - // printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); + // printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); // Allocate vectors in device memory uint64 *d_p,*d_pshft,*d_k; uint32 *d_zshft,*d_stidx; @@ -967,13 +952,13 @@ void ui64_bitstr(const uint64 ui64, char*ostr) // Reference computation: j = (uint32)twopmodq78_3WORD_DOUBLE((uint64)p, k); - ASSERT(HERE, (j == 1), "cudaVecModpowTest78_0 ref-comp failed!"); + ASSERT((j == 1), "cudaVecModpowTest78_0 ref-comp failed!"); // Test GPU results: for(i = 0; i < N; ++i) { if(*(h_B + i) != 1) { printf("cudaVecModpowTest78_0: Mismatch between Ref and GPU result:\n"); - printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %llu\n", i,*(h_B + i), j,p,k); - ASSERT(HERE, *(h_B + i) == 1, "cudaVecModpowTest78_0 failed!"); + printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,k); + ASSERT(*(h_B + i) == 1, "cudaVecModpowTest78_0 failed!"); } } printf("cudaVecModpowTest78_0 with %d test (p,q) pairs succeeded!\n",N); @@ -1024,7 +1009,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) k = (uint64)rnd; *(h_p + nelts) = p ; *(h_pshft + nelts) = pshift ; *(h_k + nelts) = k; *(h_zshft + nelts) = zshift ; *(h_stidx + nelts) = start_index; - // printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",nelts, p, pshift, zshift, start_index, k); + // printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %" PRIu64 "\n",nelts, p, pshift, zshift, start_index, k); ++nelts; } printf("Testing %d 78-bit known-factors...",nelts); @@ -1056,7 +1041,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) for(i = 0; i < N; ++i) { *(h_B+i) = 0; } - // printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); + // printf("Host code: p = %u, pshift = %u, k 
= %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); // Allocate vectors in device memory uint64 *d_p,*d_pshft,*d_k; uint32 *d_zshft,*d_stidx; @@ -1094,8 +1079,8 @@ void ui64_bitstr(const uint64 ui64, char*ostr) j = (uint32)twopmodq78_3WORD_DOUBLE((uint64)p, k); if((j != 1) || (*(h_B + i) != 1)) { printf("cudaVecModpowTest78: Mismatch between Ref and GPU result:\n"); - printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %llu\n", i,*(h_B + i), j,p,k); - ASSERT(HERE, 0, "cudaVecModpowTest78 failed!"); + printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,k); + ASSERT(0, "cudaVecModpowTest78 failed!"); } } printf("cudaVecModpowTest78 with %d test (p,q) pairs succeeded!\n",nelts); @@ -1160,12 +1145,12 @@ void ui64_bitstr(const uint64 ui64, char*ostr) k = x96.d0; // Skip any (p,q) pair for which the k > 2^64: if(x96.d1 != 0) { // x128 holds k - // printf("Warning: k > 2^64 detected for (p,q) = %u,[%u*2^64 + %llu] ... skipping this datum.\n",p,q96.d1,q96.d0); + // printf("Warning: k > 2^64 detected for (p,q) = %u,[%u*2^64 + %" PRIu64 "] ... 
skipping this datum.\n",p,q96.d1,q96.d0); continue; } *(h_p + nelts) = p ; *(h_pshft + nelts) = pshift ; *(h_k + nelts) = k; *(h_zshft + nelts) = zshift ; *(h_stidx + nelts) = start_index; - // printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %llu\n",nelts, p, pshift, zshift, start_index, k); + // printf("p[%3d] = %u: pshift = %8u, zshift = %8u, stidx = %2u, k = %" PRIu64 "\n",nelts, p, pshift, zshift, start_index, k); ++nelts; } printf("Testing %d 96-bit known-factors...",nelts); @@ -1197,7 +1182,7 @@ void ui64_bitstr(const uint64 ui64, char*ostr) for(i = 0; i < N; ++i) { *(h_B+i) = 0; } - // printf("Host code: p = %u, pshift = %u, k = %llu, zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); + // printf("Host code: p = %u, pshift = %u, k = %" PRIu64 ", zshift = %u, start_index = %u\n", p,pshift,h_A[0],zshift,start_index); // Allocate vectors in device memory uint64 *d_p,*d_pshft,*d_k; uint32 *d_zshft,*d_stidx; @@ -1236,8 +1221,8 @@ void ui64_bitstr(const uint64 ui64, char*ostr) j = (q96.d1 == 0) && (q96.d0 == 1); if((j != 1) || (*(h_B + i) != 1)) { printf("cudaVecModpowTest96: Mismatch between Ref and GPU result:\n"); - printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %llu\n", i,*(h_B + i), j,p,k); - ASSERT(HERE, 0, "cudaVecModpowTest96 failed!"); + printf("res[%d] = %d [ref = %d] = 2^p - 1 (mod q) with (p,k) = %u, %" PRIu64 "\n", i,*(h_B + i), j,p,k); + ASSERT(0, "cudaVecModpowTest96 failed!"); } } printf("cudaVecModpowTest96 with %d test (p,q) pairs succeeded!\n",nelts); @@ -1452,7 +1437,7 @@ void host_init(void) TWO25FLOAT = (double)0x02000000; TWO25FLINV = 1.0/TWO25FLOAT; TWO26FLOAT = (double)0x04000000; TWO26FLINV = 1.0/TWO26FLOAT; dbl = qfdbl(qfmul_pow2(QONE, -26)); - ASSERT(HERE, TWO26FLINV == dbl, "TWO26FLINV!"); + ASSERT(TWO26FLINV == dbl, "TWO26FLINV!"); TWO13FLINV = qfdbl(qfmul_pow2(QONE, -13)); @@ -1477,8 +1462,8 @@ void host_init(void) qtest(); // 09/23/2012: Move to after above float-consts-inits 
because of the qfloat/mi64 routines which use those consts. /* Use qfloat routines to set the global floating-point constant 1/sqrt(2): */ - ASSERT(HERE, ISRT2 == qfdbl(QISRT2), "1/sqrt2 precision check failed!"); - ASSERT(HERE, SQRT2 == qfdbl(QSQRT2), " sqrt2 precision check failed!"); + ASSERT(ISRT2 == qfdbl(QISRT2), "1/sqrt2 precision check failed!"); + ASSERT(SQRT2 == qfdbl(QSQRT2), " sqrt2 precision check failed!"); #ifdef CPU_IS_X86 // May 2018: It seems I only found need to call this runtime CPU-mode setting in 32-bit x86 mode, not 64-bit. But had occasion // to fiddle w/rnd-mode in some x86_64 tests, so changed things so that the function is *defined* in both 32 and 64-bit modes. @@ -1490,13 +1475,13 @@ void host_init(void) // Test wide-mul routines: printf("INFO: testing IMUL routines...\n"); - ASSERT(HERE, test_mul() == 0, "test_mul() returns nonzero!"); + ASSERT(test_mul() == 0, "test_mul() returns nonzero!"); // Test the 64-bit 2^[+|-]p (mod q) functions: uint32 imax = 100000; fprintf(stderr,"INFO: Testing 64-bit 2^p (mod q) functions with %u random (p, q odd) pairs...\n",imax); clock1 = clock(); - ASSERT(HERE, test_twopmodq64(imax) == 0, "test_twopmodq64() returns nonzero!"); + ASSERT(test_twopmodq64(imax) == 0, "test_twopmodq64() returns nonzero!"); clock2 = clock(); tdiff = (double)(clock2 - clock1); // printf("Time for %u 2^[+|-]p (mod q) call pairs =%s\n",imax, get_time_str(tdiff)); @@ -1521,10 +1506,10 @@ void host_init(void) uint64 vec[max_test_dim], exp; // Use a known M-prime exponent and dimension vec suitably const uint32 mers_expos[] = {61,89,107,127,521,607,1279,2203,2281,3217,4253,4423,9689,9941,11213,19937,21701,0x0}; for(i = 0, exp = (uint64)mers_expos[i]; exp != 0; i++) { - fprintf(stderr,"TEST_MI64_PRP: Base-3 Fermat-PRP test of M(%llu)...\n",exp); - ASSERT(HERE, exp < (max_test_dim<<6), "Bignum-PRP test exponent larger than test-vec dimension permits!"); + fprintf(stderr,"TEST_MI64_PRP: Base-3 Fermat-PRP test of M(%" PRIu64 
")...\n",exp); + ASSERT(exp < (max_test_dim<<6), "Bignum-PRP test exponent larger than test-vec dimension permits!"); j = mi64_init_mers_or_ferm_modulus(exp, 0, vec); - ASSERT(HERE, mi64_pprimeF(vec,3ull,j), "TEST_MI64_PRP: Base-3 Fermat-PRP test fails!"); + ASSERT(mi64_pprimeF(vec,3ull,j), "TEST_MI64_PRP: Base-3 Fermat-PRP test fails!"); } exit(0); #endif @@ -1534,21 +1519,21 @@ void host_init(void) printf("INFO: Timing-testing selected FFT macros...\n"); #if defined(USE_SSE2) && !defined(USE_AVX) // 4-DFT is SSE2-only -// ASSERT(HERE, test_radix4_dft() == 0, "test_radix4_dft() returns nonzero!"); +// ASSERT(test_radix4_dft() == 0, "test_radix4_dft() returns nonzero!"); #endif -// ASSERT(HERE, test_radix16_dft() == 0, "test_radix16_dft() returns nonzero!"); +// ASSERT(test_radix16_dft() == 0, "test_radix16_dft() returns nonzero!"); #include "radix32_dif_dit_pass_asm.h" // Commenting this out gives compile error -// ASSERT(HERE, test_radix32_dft() == 0, "test_radix32_dft() returns nonzero!"); +// ASSERT(test_radix32_dft() == 0, "test_radix32_dft() returns nonzero!"); #ifdef USE_AVX test_vperm2f128(); // Is one designed for step-thru debug exit(0); -// ASSERT(HERE, test_simd_transpose_4x4() == 0, "test_simd_transpose_4x4() returns nonzero!"); +// ASSERT(test_simd_transpose_4x4() == 0, "test_simd_transpose_4x4() returns nonzero!"); #endif #ifdef USE_AVX512 - ASSERT(HERE, test_simd_transpose_8x8() == 0, "test_simd_transpose_8x8() returns nonzero!"); + ASSERT(test_simd_transpose_8x8() == 0, "test_simd_transpose_8x8() returns nonzero!"); exit(0); #endif #endif @@ -1557,23 +1542,23 @@ exit(0); #if INCLUDE_GMP && 0 uint32 m = 33; // 7 Sep 2021: GMP gcd on Haswell quad needs 24|54 min for F31|32-sized inputs; insufficient RAM (8 GB) for F33 // On KNL with 16GB MCDRAM, need ??|??|?? min for F31|32|33, with F30 running on cores 0-63 and GIMPS-DC on 64-67. 
- ASSERT(HERE, m < 64, "Fermat-number index must be < 64!"); + ASSERT(m < 64, "Fermat-number index must be < 64!"); printf("INFO: testing GCD routines on F%u-sized inputs\n",m); // Apr 2021: check known factor of F31 using both mi64_div and GMP gcd, to get timing on the latter: rng_isaac_init(TRUE); uint64 rem[2] = {0ull,0ull}, q[2] = {3118754346955702273ull,2544ull}; // Known factor of F31: k = 3.13.140091319777; q = k.2^(m+2) + 1 int i,isfact,nlimb = (1<<(m-6)) + 1; // # of 64-bit limbs in Fm, which has 2^m+1 bits, thus needs one extra limb for the high 1-bit // vec0 is used for scratch storage, since mi64_mul_vector() does not permit in-place operation: - uint64*vec0 = calloc(nlimb,sizeof(uint64)); ASSERT(HERE, vec0 != NULL, "vec0[]-array alloc failed!"); - uint64*vec1 = calloc(nlimb,sizeof(uint64)); ASSERT(HERE, vec1 != NULL, "vec1[]-array alloc failed!"); - uint64*vec2 = calloc(nlimb,sizeof(uint64)); ASSERT(HERE, vec2 != NULL, "vec2[]-array alloc failed!"); + uint64*vec0 = calloc(nlimb,sizeof(uint64)); ASSERT(vec0 != NULL, "vec0[]-array alloc failed!"); + uint64*vec1 = calloc(nlimb,sizeof(uint64)); ASSERT(vec1 != NULL, "vec1[]-array alloc failed!"); + uint64*vec2 = calloc(nlimb,sizeof(uint64)); ASSERT(vec2 != NULL, "vec2[]-array alloc failed!"); // Init 2 random (mlimb-1)-length multiples of q: for(i = 0; i < nlimb-2; i++) { vec0[i] = rng_isaac_rand(); vec1[i] = rng_isaac_rand(); } // i holds product length on return: - mi64_mul_vector(vec1,nlimb-2, q,2, vec2,&i); ASSERT(HERE, i == nlimb, "Bad product length in gcd-test init!"); - mi64_mul_vector(vec0,nlimb-2, q,2, vec1,&i); ASSERT(HERE, i == nlimb, "Bad product length in gcd-test init!"); - isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem); ASSERT(HERE, isfact != 0, "mi64_div failed to find target factor!"); - isfact = mi64_div(vec2,q, nlimb,2, 0x0, rem); ASSERT(HERE, isfact != 0, "mi64_div failed to find target factor!"); + mi64_mul_vector(vec1,nlimb-2, q,2, vec2,&i); ASSERT(i == nlimb, "Bad product length in 
gcd-test init!"); + mi64_mul_vector(vec0,nlimb-2, q,2, vec1,&i); ASSERT(i == nlimb, "Bad product length in gcd-test init!"); + isfact = mi64_div(vec1,q, nlimb,2, 0x0, rem); ASSERT(isfact != 0, "mi64_div failed to find target factor!"); + isfact = mi64_div(vec2,q, nlimb,2, 0x0, rem); ASSERT(isfact != 0, "mi64_div failed to find target factor!"); // Now feed our two random-multiple vectors to GMP gcd: char gcd_str[STR_MAX_LEN]; isfact = gcd(0,0ull,vec1,vec2,nlimb,gcd_str); // 1st arg = stage just completed @@ -1650,7 +1635,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u rng64 calls =%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i64 != 0ull,"rng64 sum = 0!"); + ASSERT(i64 != 0ull,"rng64 sum = 0!"); clock1 = clock(); i32 = 0; @@ -1665,7 +1650,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u [rng64 + 4*popcount32()] calls =%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i32,"popcount32 sum = 0!"); + ASSERT(i32,"popcount32 sum = 0!"); clock1 = clock(); i32 = 0; @@ -1679,7 +1664,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u [rng64 + 4*popcount64()] calls =%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i32,"popcount64 sum = 0!"); + ASSERT(i32,"popcount64 sum = 0!"); clock1 = clock(); i32 = 0; @@ -1694,7 +1679,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u [rng64 + 4*leadz32()] calls =%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i32,"leadz32 sum = 0!"); + ASSERT(i32,"leadz32 sum = 0!"); clock1 = clock(); i32 = 0; @@ -1708,7 +1693,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u [rng64 + 4*leadz64()] calls =%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i32,"leadz64 sum = 0!"); + ASSERT(i32,"leadz64 sum = 0!"); clock1 = clock(); i32 = 0; @@ -1724,7 +1709,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u [rng64 + 4*trailz32()] calls 
=%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i32,"trailz32 sum = 0!"); + ASSERT(i32,"trailz32 sum = 0!"); clock1 = clock(); i32 = 0; @@ -1738,7 +1723,7 @@ exit(0); clock2 = clock(); tdiff = (double)(clock2 - clock1); printf("Time for %u [rng64 + 4*trailz64()] calls =%s\n",imax, get_time_str(tdiff)); - ASSERT(HERE, i32,"trailz64 sum = 0!"); + ASSERT(i32,"trailz64 sum = 0!"); exit(0); clock1 = clock(); for(i = 0; i < imax; i++) { @@ -1747,11 +1732,11 @@ exit(0); x32 = (uint32)i64; int ii = ith_set_bit32(x32,bit); if(popcount32(x32) < bit) - ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!"); + ASSERT(ii == -1, "[bit]th-bit specifier out of range!"); else { uint32 tmp32 = x32 << (31-ii); - ASSERT(HERE, tmp32 & 0x80000000,"ith_set_bit64 retval not actually set!"); - ASSERT(HERE, popcount32(tmp32) == bit, "ith_set_bit32 checksum fail!"); + ASSERT(tmp32 & 0x80000000,"ith_set_bit64 retval not actually set!"); + ASSERT(popcount32(tmp32) == bit, "ith_set_bit32 checksum fail!"); } } clock2 = clock(); @@ -1764,12 +1749,12 @@ exit(0); bit = (i64>>32) & 0x3f; if(!bit) continue; int ii = ith_set_bit64(i64,bit); if(popcount64(i64) < bit) - ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!"); + ASSERT(ii == -1, "[bit]th-bit specifier out of range!"); else { uint64 tmp64 = i64 << (63-ii); // Must cast result of AND to 32-bit here (via compare-vs-0) since ASSERT (expr) is 32-bit: - ASSERT(HERE, (tmp64 & 0x8000000000000000ull) != 0,"ith_set_bit64 retval not actually set!"); - ASSERT(HERE, popcount64(tmp64) == bit, "ith_set_bit64 checksum fail!"); + ASSERT((tmp64 & 0x8000000000000000ull) != 0,"ith_set_bit64 retval not actually set!"); + ASSERT(popcount64(tmp64) == bit, "ith_set_bit64 checksum fail!"); } } clock2 = clock(); @@ -1786,12 +1771,12 @@ exit(0); bit = (iarr[0]>>32) & 0xff; if(!bit) continue; int ii = mi64_ith_set_bit(iarr,bit,4); if(mi64_popcount(iarr,4) < bit) - ASSERT(HERE, ii == -1, "[bit]th-bit specifier out of range!"); + ASSERT(ii == -1, 
"[bit]th-bit specifier out of range!"); else { mi64_shl(iarr,iarr,(255-ii),4); // Must cast result of AND to 32-bit here (via compare-vs-0) since ASSERT (expr) is 32-bit: - ASSERT(HERE, (iarr[3] & 0x8000000000000000ull) != 0,"mi64_ith_set_bit64 retval not actually set!"); - ASSERT(HERE, mi64_popcount(iarr,4) == bit, "mi64_ith_set_bit64 checksum fail!"); + ASSERT((iarr[3] & 0x8000000000000000ull) != 0,"mi64_ith_set_bit64 retval not actually set!"); + ASSERT(mi64_popcount(iarr,4) == bit, "mi64_ith_set_bit64 checksum fail!"); } } clock2 = clock(); @@ -1804,10 +1789,10 @@ exit(0); int i; const int n = 1000, iters = 1000000; // Allocate the main data arrays, require these to be on 16-byte boundaries to enable SSE2-based addsub: - uint64 *u = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(HERE, ((uint32)u & 0xf) == 0, "u not 16-byte aligned!"); - uint64 *v = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(HERE, ((uint32)v & 0xf) == 0, "u not 16-byte aligned!"); - uint64 *x = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(HERE, ((uint32)x & 0xf) == 0, "u not 16-byte aligned!"); - uint64 *y = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(HERE, ((uint32)y & 0xf) == 0, "u not 16-byte aligned!"); + uint64 *u = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(((uint32)u & 0xf) == 0, "u not 16-byte aligned!"); + uint64 *v = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(((uint32)v & 0xf) == 0, "u not 16-byte aligned!"); + uint64 *x = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(((uint32)x & 0xf) == 0, "u not 16-byte aligned!"); + uint64 *y = (uint64 *)calloc(n, sizeof(uint64)); ASSERT(((uint32)y & 0xf) == 0, "u not 16-byte aligned!"); /* Init the RNG and the inputs: */ rng_isaac_init(TRUE); @@ -1821,14 +1806,14 @@ exit(0); uint64 cy1 = mi64_add(u,v,x,n); uint64 cy2 = mi64_add_ref(u,v,y,n); if(cy1 != cy2) { - printf("Carryout mismatch: cy1 = %llu, cy2 = %llu\n",cy1,cy2); - // ASSERT(HERE, 0, "Incorrect mi64_add carryout"); // GCC 4.4.5 builds on my SB give carry-mismatch here ... wtf? 
+ printf("Carryout mismatch: cy1 = %" PRIu64 ", cy2 = %" PRIu64 "\n",cy1,cy2); + // ASSERT(0, "Incorrect mi64_add carryout"); // GCC 4.4.5 builds on my SB give carry-mismatch here ... wtf? } for(i = 0; i < n; i++) { if(x[i] != y[i]) { - printf("Output mismatch: x[%d] = %llu, y[%d] = %llu\n",i,x[i],i,y[i]); - ASSERT(HERE, 0, "Incorrect mi64_add output element"); + printf("Output mismatch: x[%d] = %" PRIu64 ", y[%d] = %" PRIu64 "\n",i,x[i],i,y[i]); + ASSERT(0, "Incorrect mi64_add output element"); } } @@ -1840,7 +1825,7 @@ exit(0); } clock2 = clock(); tdiff = (double)(clock2 - clock1); - printf ("mi64_add: Time for %llu limbs =%s\n",(uint64)iters*n, get_time_str(tdiff)); + printf ("mi64_add: Time for %" PRIu64 " limbs =%s\n",(uint64)iters*n, get_time_str(tdiff)); exit(0); #endif /************************************************************/ @@ -1871,7 +1856,7 @@ exit(0); #error Unrecognized multithreading model! #endif // MAX_THREADS based on number of processing cores will most often be a power of 2, but don't assume that. 
- ASSERT(HERE, MAX_THREADS > 0,"Mlucas.c: MAX_THREADS must be > 0"); + ASSERT(MAX_THREADS > 0,"Mlucas.c: MAX_THREADS must be > 0"); printf("INFO: System has %d available processor cores.\n", MAX_THREADS); @@ -1880,7 +1865,7 @@ exit(0); ncpu = MAX_THREADS; printf("INFO: Testing Multithreading support with %d threads...\n", ncpu); // Toggle boolean 2nd arg here to enable verbose mode: - ASSERT(HERE, test_pthreads(nthr,FALSE) == 0, "test_pthreads() returns nonzero!"); + ASSERT(test_pthreads(nthr,FALSE) == 0, "test_pthreads() returns nonzero!"); #endif #endif @@ -1921,7 +1906,7 @@ void set_stacklimit_restart(char *argv[]) if (getrlimit(RLIMIT_STACK, &stack_limits)) { fprintf(stderr, "Call to getrlimit() failed.\n"); - ASSERT(HERE, 0, "Exiting."); + ASSERT(0, "Exiting."); } printf("Old stack_limits: cur = %zu, max = %zu, [RLIM_INFINITY = %zu]\n", stack_limits.rlim_cur, stack_limits.rlim_max, RLIM_INFINITY); @@ -1932,14 +1917,14 @@ void set_stacklimit_restart(char *argv[]) if (setrlimit(RLIMIT_STACK, &stack_limits)) { fprintf(stderr, "Call to setrlimit() failed.\n"); - ASSERT(HERE, 0, "Exiting."); + ASSERT(0, "Exiting."); } printf("New stack_limits: cur = %zu, max = %zu\n", stack_limits.rlim_cur, stack_limits.rlim_max); if(execvp(argv[0], argv)) { fprintf(stderr, "Call to execvp() failed.\n"); - ASSERT(HERE, 0, "Exiting."); + ASSERT(0, "Exiting."); } #endif /* CPU_IS_X86 */ } @@ -1979,7 +1964,7 @@ uint32 get_system_ram(void) { MEMORYSTATUSEX memInfo; memInfo.dwLength = sizeof(memInfo); GlobalMemoryStatusEx(&memInfo); - fprintf(stderr, "System total RAM = %llu, free RAM = %llu\n", memInfo.ullTotalPhys>>20, memInfo.ullAvailPhys>>20); + fprintf(stderr, "System total RAM = %" PRIu64 ", free RAM = %" PRIu64 "\n", memInfo.ullTotalPhys>>20, memInfo.ullAvailPhys>>20); return memInfo.ullAvailPhys>>20; #elif defined(OS_TYPE_MACOSX) @@ -2017,7 +2002,7 @@ uint32 get_system_ram(void) { { char in_line[STR_MAX_LEN]; FILE*fp = mlucas_fopen("/proc/cpuinfo", "r"); - ASSERT(HERE, fp != 
0x0, "/proc/cpuinfo file not found!"); + ASSERT(fp != 0x0, "/proc/cpuinfo file not found!"); while(fgets(in_line, STR_MAX_LEN, fp) != 0x0) { if(strstr(in_line, "asimd") != 0) return 1; @@ -2106,7 +2091,7 @@ void print_host_info(void) if(cudaError != cudaSuccess) { printf("ERROR: cudaGetLastError() returned %d: %s\n", cudaError, cudaGetErrorString(cudaError)); - ASSERT(HERE, 0, "gpu_sieve: GPU-side error detected!"); + ASSERT(0, "gpu_sieve: GPU-side error detected!"); } // cudaVecAddTest(); @@ -2131,7 +2116,7 @@ void print_host_info(void) char hwloc_version[12]; snprintf(hwloc_version,sizeof(hwloc_version),"%u.%u.%u",HWLOC_API_VERSION>>16,(HWLOC_API_VERSION>>8)&0xff,HWLOC_API_VERSION&0xff); printf("HWLOC Version = %s; \n",hwloc_version); - ASSERT(HERE,hw_topology != 0x0,"HWLOC hardware topology object not initialized!"); + ASSERT(hw_topology != 0x0,"HWLOC hardware topology object not initialized!"); int topodepth = hwloc_topology_get_depth(hw_topology); uint32 nsock = hwloc_get_nbobjs_by_type(hw_topology, HWLOC_OBJ_PACKAGE); uint32 ncore = hwloc_get_nbobjs_by_type(hw_topology, HWLOC_OBJ_CORE); @@ -2149,7 +2134,7 @@ void print_host_info(void) #endif } else { #ifdef USE_ARM_V8_SIMD - ASSERT(HERE, 0, "#define USE_ARM_V8_SIMD invoked but no advanced-SIMD support detected on this CPU!\n"); + ASSERT(0, "#define USE_ARM_V8_SIMD invoked but no advanced-SIMD support detected on this CPU!\n"); #endif } @@ -2167,7 +2152,7 @@ void print_host_info(void) #ifdef USE_IMCI512 // 1st-gen Xeon Phi (KNF,KNC) if(has_avx512()) { - ASSERT(HERE, 0, "Build uses AVX-512 instruction set, but only k1om / IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n"); + ASSERT(0, "Build uses AVX-512 instruction set, but only k1om / IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n"); } else if(has_imci512()) { printf("INFO: Build uses k1om / IMCI-512 instruction set.\n"); } else { @@ -2178,13 +2163,13 @@ void print_host_info(void) CPUID(1,0,a,b,c,d); printf("has_imci512: CPUID returns [a,b,c,d] = 
[%8X,%8X,%8X,%8X]\n",a,b,c,d); printf("#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); - ASSERT(HERE, 0, "#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); + ASSERT(0, "#define USE_IMCI512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); } #elif(defined(USE_AVX512)) if(has_imci512()) { - ASSERT(HERE, 0, "Build uses AVX512 instruction set, but only IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n"); + ASSERT(0, "Build uses AVX512 instruction set, but only IMCI-512 (1st-gen Xeon Phi) supported this CPU!\n"); } else if(has_avx512()) { printf("INFO: Build uses AVX512 instruction set.\n"); } else { @@ -2195,7 +2180,7 @@ void print_host_info(void) CPUID(1,0,a,b,c,d); printf("has_avx512: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d); printf("#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); - ASSERT(HERE, 0, "#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); + ASSERT(0, "#define USE_AVX512 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); } #elif(defined(USE_AVX2)) @@ -2212,7 +2197,7 @@ void print_host_info(void) CPUID(1,0,a,b,c,d); printf("has_avx2: CPUID returns [a,b,c,d] = [%8X,%8X,%8X,%8X]\n",a,b,c,d); printf("#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); - ASSERT(HERE, 0, "#define USE_AVX2 invoked but no FMA support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); + ASSERT(0, "#define USE_AVX2 invoked but no FMA support detected on this CPU! 
Check get_cpuid functionality and CPU type.\n"); } #elif(defined(USE_AVX)) @@ -2222,7 +2207,7 @@ void print_host_info(void) } else if(has_avx()) { printf("INFO: Build uses AVX instruction set.\n"); } else { - ASSERT(HERE, 0, "#define USE_AVX invoked but no AVX support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); + ASSERT(0, "#define USE_AVX invoked but no AVX support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); } #elif(defined(USE_SSE2)) @@ -2235,7 +2220,7 @@ void print_host_info(void) if(has_sse2()) { printf("INFO: Build uses SSE2 ... 'enhanced SSE2' supported by CPU: SSE[3,3e,4.1,4.2] = [%u,%u,%u,%u]\n",has_sse3(),has_sse3e(),has_sse41(),has_sse42()); } else { - ASSERT(HERE, 0, "#define USE_SSE2 invoked but no SSE2 support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); + ASSERT(0, "#define USE_SSE2 invoked but no SSE2 support detected on this CPU! Check get_cpuid functionality and CPU type.\n"); } #else @@ -2272,7 +2257,7 @@ void print_host_info(void) printf("INFO: mkdir -p \"%s\" succeeded\n", MLUCAS_PATH); } else { fprintf(stderr, "ERROR: mkdir -p \"%s\" failed\n", MLUCAS_PATH); - ASSERT(HERE, 0, "Exiting."); + ASSERT(0, "Exiting."); } } } @@ -2423,7 +2408,7 @@ For the purpose of completeness, the other FPU control bits are as follows #else unsigned short FPUCTRL; #endif - ASSERT(HERE, (FPU_MODE == FPU_64RND) || (FPU_MODE == FPU_64CHOP), "Illegal value of FPU_MODE"); + ASSERT((FPU_MODE == FPU_64RND) || (FPU_MODE == FPU_64CHOP), "Illegal value of FPU_MODE"); // Check the SIMD control word: #ifdef USE_SSE2 @@ -2543,7 +2528,7 @@ For the purpose of completeness, the other FPU control bits are as follows printf("INFO: compiler sets x87 FPU to [round ==> 0] (truncate) rounding mode. 
Overriding...Setting to [round ==> nearest].\n"); break; default: - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } @@ -2560,24 +2545,24 @@ void check_nbits_in_types(void) double ln2 = LOG2; /* Make sure TRUE and FALSE behave as required: */ - ASSERT(HERE, !FALSE && TRUE, "TRUE and FALSE do not behave as required in check_nbits_in_types"); + ASSERT(!FALSE && TRUE, "TRUE and FALSE do not behave as required in check_nbits_in_types"); /* Check lengths of basic data types: */ - ASSERT(HERE, sizeof( int8 ) == 1, "sizeof( int8 ) != 1"); - ASSERT(HERE, sizeof(uint8 ) == 1, "sizeof(uint8 ) != 1"); - ASSERT(HERE, sizeof( int16) == 2, "sizeof( int16) != 2"); - ASSERT(HERE, sizeof(uint16) == 2, "sizeof(uint16) != 2"); - ASSERT(HERE, sizeof( int32) == 4, "sizeof( int32) != 4"); - ASSERT(HERE, sizeof(uint32) == 4, "sizeof(uint32) != 4"); - ASSERT(HERE, sizeof( int64) == 8, "sizeof( int64) != 8"); - ASSERT(HERE, sizeof(uint64) == 8, "sizeof(uint64) != 8"); - ASSERT(HERE, sizeof(uint64) >= sizeof(void*), "sizeof(long long) != sizeof(void*)"); /* ALIGN_DOUBLES assumes this. */ + ASSERT(sizeof( int8 ) == 1, "sizeof( int8 ) != 1"); + ASSERT(sizeof(uint8 ) == 1, "sizeof(uint8 ) != 1"); + ASSERT(sizeof( int16) == 2, "sizeof( int16) != 2"); + ASSERT(sizeof(uint16) == 2, "sizeof(uint16) != 2"); + ASSERT(sizeof( int32) == 4, "sizeof( int32) != 4"); + ASSERT(sizeof(uint32) == 4, "sizeof(uint32) != 4"); + ASSERT(sizeof( int64) == 8, "sizeof( int64) != 8"); + ASSERT(sizeof(uint64) == 8, "sizeof(uint64) != 8"); + ASSERT(sizeof(uint64) >= sizeof(void*), "sizeof(long long) != sizeof(void*)"); /* ALIGN_DOUBLES assumes this. 
*/ /* AltiVec vector types: */ #if(CPU_HAS_ALTIVEC || CPU_IS_CELL) - ASSERT(HERE, sizeof(vec_uint8X16) == 16 , "sizeof(vec_uint8X16) != 16 "); - ASSERT(HERE, sizeof(vec_uint16X8) == 16 , "sizeof(vec_uint16x8) != 16 "); - ASSERT(HERE, sizeof(vec_uint32X4) == 16 , "sizeof(vec_uint32x4) != 16 "); + ASSERT(sizeof(vec_uint8X16) == 16 , "sizeof(vec_uint8X16) != 16 "); + ASSERT(sizeof(vec_uint16X8) == 16 , "sizeof(vec_uint16x8) != 16 "); + ASSERT(sizeof(vec_uint32X4) == 16 , "sizeof(vec_uint32x4) != 16 "); #endif uint64 x = 0x0706050403020100ull; @@ -2585,14 +2570,14 @@ void check_nbits_in_types(void) // Runtime ordering is little-endian: if(byte_arr[0] == 0 && byte_arr[1] == 1 && byte_arr[2] == 2 && byte_arr[3] == 3 && byte_arr[4] == 4 && byte_arr[5] == 5 && byte_arr[6] == 6 && byte_arr[7] == 7) { #ifdef USE_BIG_ENDIAN - ASSERT(HERE, 0, "USE_BIG_ENDIAN set in platform.h but little-endian detected at runtime!"); + ASSERT(0, "USE_BIG_ENDIAN set in platform.h but little-endian detected at runtime!"); #endif } else if(byte_arr[0] == 7 && byte_arr[1] == 6 && byte_arr[2] == 5 && byte_arr[3] == 4 && byte_arr[4] == 3 && byte_arr[5] == 2 && byte_arr[6] == 1 && byte_arr[7] == 0) { #ifndef USE_BIG_ENDIAN - ASSERT(HERE, 0, "USE_BIG_ENDIAN not set in platform.h but big-endian detected at runtime!"); + ASSERT(0, "USE_BIG_ENDIAN not set in platform.h but big-endian detected at runtime!"); #endif } else { - ASSERT(HERE, 0, "Endianness detected as neither big nor little-endian at runtime!"); + ASSERT(0, "Endianness detected as neither big nor little-endian at runtime!"); } // Init RNG: @@ -2659,10 +2644,10 @@ void check_nbits_in_types(void) #else sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, pi = %20.3f, DNINT(pi ) = %20.3f\n", RND_A, tpi, (double)DNINT(tpi)); - ASSERT(HERE, (double)DNINT(tpi) == 3.0, cbuf); + ASSERT((double)DNINT(tpi) == 3.0, cbuf); sprintf(cbuf,"in check_nbits_in_types: RND_A = %20.3f, ln2 = %20.3f, DNINT(ln2) = %20.3f\n", RND_A, ln2, (double)DNINT(ln2)); - 
ASSERT(HERE, (double)DNINT(ln2) == 1.0, cbuf); + ASSERT((double)DNINT(ln2) == 1.0, cbuf); #endif @@ -2684,11 +2669,11 @@ to original "fiddle these depending on exponent being tested" scheme. */ FFT_MUL_BASE = (double)((uint64)1 << FFT_MUL_BITS); /* Intend to relax this later to allow powers of 2 as large as 2^54: */ -ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); +ASSERT(((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); - ASSERT(HERE, trailz64((uint64)FFT_MUL_BASE) == FFT_MUL_BITS, "mi64_cvt_double_uint64: trailz64((uint64)FFT_MUL_BASE) != FFT_MUL_BITS"); - ASSERT(HERE, DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_double_uint64: FFT_MUL_BASE not pure-integer!"); - ASSERT(HERE, FFT_MUL_BASE < 1.0*0x8000000*0x8000000, "mi64_cvt_double_uint64: FFT_MUL_BASE >= maximum allowed value of 2^54!"); + ASSERT(trailz64((uint64)FFT_MUL_BASE) == FFT_MUL_BITS, "mi64_cvt_double_uint64: trailz64((uint64)FFT_MUL_BASE) != FFT_MUL_BITS"); + ASSERT(DNINT(FFT_MUL_BASE) == FFT_MUL_BASE, "mi64_cvt_double_uint64: FFT_MUL_BASE not pure-integer!"); + ASSERT(FFT_MUL_BASE < 1.0*0x8000000*0x8000000, "mi64_cvt_double_uint64: FFT_MUL_BASE >= maximum allowed value of 2^54!"); FFT_MUL_BASE_INV = 1.0/FFT_MUL_BASE; #if FAST_UINT32_MOD @@ -2758,32 +2743,32 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); } } printf ("%u cases of %u [%6.2f%%] needed adjustment.\n",nneg,ntry,100.*nneg/(float)ntry); - ASSERT(HERE, nfail == 0, "Fast-uint32-mod test failed for 1 or more inputs!"); + ASSERT(nfail == 0, "Fast-uint32-mod test failed for 1 or more inputs!"); #endif // #if FAST_UINT32_MOD ? 
/* Test approximate 1/x and 1/sqrt(x) routines: */ - ftmp = finvest(1.5, 8); /*fprintf(stderr, "finvest(1.5, 8) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/ ASSERT(HERE, fabs(ftmp - 0.666666666666667) < 4e-03, "Unacceptable level of error in finvest() call!"); - ftmp = finvest(1.5, 53); /*fprintf(stderr, "finvest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/ ASSERT(HERE, fabs(ftmp - 0.666666666666667) < 1e-14, "Unacceptable level of error in finvest() call!"); - ftmp = finvest(1.0, 53); /*fprintf(stderr, "finvest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/ ASSERT(HERE, fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!"); - ftmp = finvest(2.0, 53); /*fprintf(stderr, "finvest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.500000000000000));*/ ASSERT(HERE, fabs(ftmp - 0.500000000000000) < 1e-14, "Unacceptable level of error in finvest() call!"); - ftmp = finvest(0.5, 53); /*fprintf(stderr, "finvest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/ ASSERT(HERE, fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!"); - ftmp = finvest(.75, 53); /*fprintf(stderr, "finvest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.333333333333333));*/ ASSERT(HERE, fabs(ftmp - 1.333333333333333) < 1e-14, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(1.5, 8); /*fprintf(stderr, "finvest(1.5, 8) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/ ASSERT(fabs(ftmp - 0.666666666666667) < 4e-03, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(1.5, 53); /*fprintf(stderr, "finvest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.666666666666667));*/ ASSERT(fabs(ftmp - 0.666666666666667) < 1e-14, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(1.0, 53); /*fprintf(stderr, "finvest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/ ASSERT(fabs(ftmp - 
1.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(2.0, 53); /*fprintf(stderr, "finvest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.500000000000000));*/ ASSERT(fabs(ftmp - 0.500000000000000) < 1e-14, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(0.5, 53); /*fprintf(stderr, "finvest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/ ASSERT(fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(.75, 53); /*fprintf(stderr, "finvest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.333333333333333));*/ ASSERT(fabs(ftmp - 1.333333333333333) < 1e-14, "Unacceptable level of error in finvest() call!"); /* Try some large and small inputs: */ - ftmp = finvest(3.141592653589793e+15, 53); /*fprintf(stderr, "finvest(3.141592653589793e+15, 53) gives err = %20.10e\n", fabs(ftmp - 3.183098861837907e-16));*/ ASSERT(HERE, fabs(ftmp - 3.183098861837907e-16) < 1e-14, "Unacceptable level of error in finvest() call!"); - ftmp = finvest(3.183098861837907e-16, 53); /*fprintf(stderr, "finvest(3.183098861837907e-16, 53) gives err = %20.10e\n", fabs(ftmp - 3.141592653589793e+15));*/ ASSERT(HERE, fabs(ftmp - 3.141592653589793e+15) < 1e+00, "Unacceptable level of error in finvest() call!"); - - ftmp = fisqrtest(1.5, 8); /*fprintf(stderr, "fisqrtest(1.5, 8) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/ ASSERT(HERE, fabs(ftmp - 0.816496580927726) < 1e-3 , "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(1.5, 53); /*fprintf(stderr, "fisqrtest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/ ASSERT(HERE, fabs(ftmp - 0.816496580927726) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(1.0, 53); /*fprintf(stderr, "fisqrtest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/ ASSERT(HERE, fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in 
fisqrtest() call!"); - ftmp = fisqrtest(2.0, 53); /*fprintf(stderr, "fisqrtest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.707106781186548));*/ ASSERT(HERE, fabs(ftmp - 0.707106781186548) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(0.5, 53); /*fprintf(stderr, "fisqrtest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 1.414213562373095));*/ ASSERT(HERE, fabs(ftmp - 1.414213562373095) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(0.3, 53); /*fprintf(stderr, "fisqrtest(0.3, 53) gives err = %20.10e\n", fabs(ftmp - 1.825741858350554));*/ ASSERT(HERE, fabs(ftmp - 1.825741858350554) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(.25, 53); /*fprintf(stderr, "fisqrtest(.25, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/ ASSERT(HERE, fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(.75, 53); /*fprintf(stderr, "fisqrtest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.154700538379251));*/ ASSERT(HERE, fabs(ftmp - 1.154700538379251) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(3.0, 53); /*fprintf(stderr, "fisqrtest(3.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.577350269189626));*/ ASSERT(HERE, fabs(ftmp - 0.577350269189626) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = finvest(3.141592653589793e+15, 53); /*fprintf(stderr, "finvest(3.141592653589793e+15, 53) gives err = %20.10e\n", fabs(ftmp - 3.183098861837907e-16));*/ ASSERT(fabs(ftmp - 3.183098861837907e-16) < 1e-14, "Unacceptable level of error in finvest() call!"); + ftmp = finvest(3.183098861837907e-16, 53); /*fprintf(stderr, "finvest(3.183098861837907e-16, 53) gives err = %20.10e\n", fabs(ftmp - 3.141592653589793e+15));*/ ASSERT(fabs(ftmp - 3.141592653589793e+15) < 1e+00, "Unacceptable level of error in finvest() call!"); + + ftmp = fisqrtest(1.5, 8); /*fprintf(stderr, 
"fisqrtest(1.5, 8) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/ ASSERT(fabs(ftmp - 0.816496580927726) < 1e-3 , "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(1.5, 53); /*fprintf(stderr, "fisqrtest(1.5, 53) gives err = %20.10e\n", fabs(ftmp - 0.816496580927726));*/ ASSERT(fabs(ftmp - 0.816496580927726) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(1.0, 53); /*fprintf(stderr, "fisqrtest(1.0, 53) gives err = %20.10e\n", fabs(ftmp - 1.000000000000000));*/ ASSERT(fabs(ftmp - 1.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(2.0, 53); /*fprintf(stderr, "fisqrtest(2.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.707106781186548));*/ ASSERT(fabs(ftmp - 0.707106781186548) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(0.5, 53); /*fprintf(stderr, "fisqrtest(0.5, 53) gives err = %20.10e\n", fabs(ftmp - 1.414213562373095));*/ ASSERT(fabs(ftmp - 1.414213562373095) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(0.3, 53); /*fprintf(stderr, "fisqrtest(0.3, 53) gives err = %20.10e\n", fabs(ftmp - 1.825741858350554));*/ ASSERT(fabs(ftmp - 1.825741858350554) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(.25, 53); /*fprintf(stderr, "fisqrtest(.25, 53) gives err = %20.10e\n", fabs(ftmp - 2.000000000000000));*/ ASSERT(fabs(ftmp - 2.000000000000000) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(.75, 53); /*fprintf(stderr, "fisqrtest(.75, 53) gives err = %20.10e\n", fabs(ftmp - 1.154700538379251));*/ ASSERT(fabs(ftmp - 1.154700538379251) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(3.0, 53); /*fprintf(stderr, "fisqrtest(3.0, 53) gives err = %20.10e\n", fabs(ftmp - 0.577350269189626));*/ ASSERT(fabs(ftmp - 0.577350269189626) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); /* Try some 
large and small inputs: */ - ftmp = fisqrtest(3.141592653589793e+15, 53); /*fprintf(stderr, "fisqrtest(3.141592653589793e+15, 53); gives err = %20.10e\n", fabs(ftmp - 1.784124116152771e-08));*/ ASSERT(HERE, fabs(ftmp - 1.784124116152771e-08) < 1e-22, "Unacceptable level of error in fisqrtest() call!"); - ftmp = fisqrtest(3.183098861837907e-16, 53); /*fprintf(stderr, "fisqrtest(3.183098861837907e-16, 53); gives err = %20.10e\n", fabs(ftmp - 5.604991216397928e+07));*/ ASSERT(HERE, fabs(ftmp - 5.604991216397928e+07) < 1e-07, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(3.141592653589793e+15, 53); /*fprintf(stderr, "fisqrtest(3.141592653589793e+15, 53); gives err = %20.10e\n", fabs(ftmp - 1.784124116152771e-08));*/ ASSERT(fabs(ftmp - 1.784124116152771e-08) < 1e-22, "Unacceptable level of error in fisqrtest() call!"); + ftmp = fisqrtest(3.183098861837907e-16, 53); /*fprintf(stderr, "fisqrtest(3.183098861837907e-16, 53); gives err = %20.10e\n", fabs(ftmp - 5.604991216397928e+07));*/ ASSERT(fabs(ftmp - 5.604991216397928e+07) < 1e-07, "Unacceptable level of error in fisqrtest() call!"); /* Now do a whole mess of 'em: */ for(i = 0; i < 100000; i++) @@ -2794,24 +2779,24 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); ftmp = finvest (fran, 53); finv = 1.0/fran; ferr = (ftmp - finv)/(ftmp + finv); - ASSERT(HERE, fabs(ferr) < 1e-14, "Unacceptable level of error in finvest () call!"); + ASSERT(fabs(ferr) < 1e-14, "Unacceptable level of error in finvest () call!"); ftmp = fisqrtest(fran, 53); fsrt = 1.0/sqrt(fran); ferr = (ftmp - fsrt)/(ftmp + fsrt); - ASSERT(HERE, fabs(ferr) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); + ASSERT(fabs(ferr) < 1e-14, "Unacceptable level of error in fisqrtest() call!"); } fran = rng_isaac_rand_double_norm_pos(); if(fran < 0.0 || fran >= 1.0) { sprintf(cbuf, "check_nbits_in_types: rng_isaac_rand_double_norm_pos returns illegal value outside [0, 1): i = %d, %e\n", i,fran); 
- ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } fran = rng_isaac_rand_double_norm_pm1(); if(fabs(fran) >= 1.0) { sprintf(cbuf, "check_nbits_in_types: rng_isaac_rand_double_norm_pm1 returns illegal value outside (-1,+1): i = %d, %e\n", i, fran); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } } @@ -2823,7 +2808,7 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); if(!nerr) printf("fma_dmult_tests completed successfully!\n"); else - ASSERT(HERE, 0, "fma_dmult_tests failed!\n"); + ASSERT(0, "fma_dmult_tests failed!\n"); */ #endif @@ -2851,7 +2836,7 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); { order = 1ull << i; prim_root_q(order, &root_re,&root_im); - // printf("FGT: prim-root of order 2^%2u = %llu + I*%llu\n",i, root_re,root_im); + // printf("FGT: prim-root of order 2^%2u = %" PRIu64 " + I*%" PRIu64 "\n",i, root_re,root_im); // Check order-primitivity of roots of order > 1 by powering result up to 2nd order; result must == -1 (mod q): if(i > 0) { for(j = 1; j < i; j++) { @@ -2859,9 +2844,9 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); root_re = qreduce(root_re); root_im = qreduce(root_im); // Only partially reduce intermediates... } root_re = qreduce_finish(root_re); root_im = qreduce_finish(root_im); // ...and then finish reducing here. 
- ASSERT(HERE, root_re == q-1 && root_im == 0ull, "Bad prim_root_q result!"); + ASSERT(root_re == q-1 && root_im == 0ull, "Bad prim_root_q result!"); } else { - ASSERT(HERE, root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!"); + ASSERT(root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!"); } } @@ -2870,18 +2855,18 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); // Power-of-2 roots satisfy simple conjugate rule, modular analog of complex conj(Re,Im) = (Re,-Im): order = 16; prim_root_q(order, &root_re,&root_im); pow_modq(order-1, root_re,root_im, &re,&im); - printf("FGT: prim-root of order %u = %llu + I*%llu, Conjugate = %llu + I*%llu [q-Im = %llu]\n",(uint32)order, root_re,root_im, re,im,q-im); + printf("FGT: prim-root of order %u = %" PRIu64 " + I*%" PRIu64 ", Conjugate = %" PRIu64 " + I*%" PRIu64 " [q-Im = %" PRIu64 "]\n",(uint32)order, root_re,root_im, re,im,q-im); // FGT: prim-root of order 16 = 1693317751237720973 + I*2283815672160731785, // Conjugate = 1693317751237720973 + I* 22027337052962166 [q-Im = 2283815672160731785] - ASSERT(HERE, root_re == re && root_im == (q-im), "Bad power-of-2 conjugate!"); + ASSERT(root_re == re && root_im == (q-im), "Bad power-of-2 conjugate!"); // Non-power-of-2 roots satisfy no simple conjugate rules, so multiply root and its conjugate together as sanity check: order = 24; prim_root_q(order, &root_re,&root_im); pow_modq(order-1, root_re,root_im, &re,&im); - printf("FGT: prim-root of order %u = %llu + I*%llu, Conjugate = %llu + I*%llu [q-Im = %llu]\n",(uint32)order, root_re,root_im, re,im,q-im); + printf("FGT: prim-root of order %u = %" PRIu64 " + I*%" PRIu64 ", Conjugate = %" PRIu64 " + I*%" PRIu64 " [q-Im = %" PRIu64 "]\n",(uint32)order, root_re,root_im, re,im,q-im); cmul_modq(root_re,root_im, re,im, &re,&im); re = qreduce_full(re); im = qreduce_full(im); - ASSERT(HERE, re == 1ull && im == 0ull, "Bad non-power-of-2 conjugate!"); + ASSERT(re == 1ull && im == 0ull, "Bad 
non-power-of-2 conjugate!"); /* 24th root: FGT: prim-root of order 24 = 244692701471512749 + I*2061150307742181202, @@ -2895,7 +2880,7 @@ ASSERT(HERE, ((uint64)FFT_MUL_BASE >> 16) == 1, "util.c: FFT_MUL_BASE != 2^16"); printf("Powers of prim-root:\n"); re = root_re; im = root_im; for(i = 0; i < order; i++) { - printf("%2u: %20llu[-= %20llu] + I*%20llu[-= %20llu]\n",i+1, re,q-re,im,q-im); + printf("%2u: %20" PRIu64 "[-= %20" PRIu64 "] + I*%20" PRIu64 "[-= %20" PRIu64 "]\n",i+1, re,q-re,im,q-im); cmul_modq(root_re,root_im, re,im, &re,&im); re = qreduce_full(re); im = qreduce_full(im); } @@ -2943,11 +2928,11 @@ The four [+-d,+-d] and four powers of I are just the eight 8th roots of unity wh { order *= odd_ord_facs[i]; prim_root_q(order, &root_re,&root_im); - // printf("FGT: prim-root of order %llu = %llu + I*%llu\n",order, root_re,root_im); - ASSERT(HERE, root_im == 0ull, "Odd roots must be strictly real!!"); + // printf("FGT: prim-root of order %" PRIu64 " = %" PRIu64 " + I*%" PRIu64 "\n",order, root_re,root_im); + ASSERT(root_im == 0ull, "Odd roots must be strictly real!!"); // Check order-primitivity of roots by raising result to (order)th power; result must == -1 (mod q): pow_modq(order, root_re,root_im, &root_re,&root_im); - ASSERT(HERE, root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!"); + ASSERT(root_re == 1ull && root_im == 0ull, "Bad prim_root_q result!"); } printf("fgt_m61 tests completed successfully!\n"); #endif @@ -3029,7 +3014,7 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562 int i,j,k, pow2; double pow2_dmult,pow2_imult; uint32 nerr = 0, itmp32; - const double crnd = 3.0*0x4000000*0x2000000, crnd50 = crnd*TWO50FLOAT; // Consts used to emulate DNINT(x) and 2^50 * DNINT(x*2^-50) + const double crnd = 3.0*0x4000000*0x2000000; double crnd50 = crnd*TWO50FLOAT; // Consts used to emulate DNINT(x) and 2^50 * DNINT(x*2^-50) // (i.e. round-to-nearest-multiple-of-2^50 ... 
alas the AVX-512 VRNDSCALEPD instruction only supports // round-to-nearest-multiple-of-negative-power-of-2, and said power is further restricted to pow < 16. static vec_dbl *sc_arr = 0x0; @@ -3037,12 +3022,12 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562 double *tmp, *dptr1,*dptr2,*dptr3,*dptr4, l2lo,l2hi, dblo,dbhi, sqr100lo[4],sqr100hi[4], dtmp,cy_max; static double *ax,*bx,*cx,*dx, *ay,*by,*cy,*dy, *alo,*blo,*clo,*dlo, *ahi,*bhi,*chi,*dhi, *acy,*alo_norm,*ahi_norm; uint64 itmp64, iax,ibx,icx,idx, iay,iby,icy,idy, ialo,iblo,iclo,idlo, iahi,ibhi,ichi,idhi; - const double prod1_adj = 3.0; // Const to multiply by base and add to prod[1] to ensure latter >= 0 + /* const */ double prod1_adj = 3.0; // Const to multiply by base and add to prod[1] to ensure latter >= 0 if(!sc_arr) { sc_arr = ALLOC_VEC_DBL(sc_arr, 8); - if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = (double *)ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((uintptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); /* Remember, rhese are POINTERS-TO-DOUBLES, so need an increment of 4 to span an AVX register: */ tmp = (double *)sc_ptr; ax = tmp + 0; bx = tmp + 1; cx = tmp + 2; dx = tmp + 3; tmp += 4; @@ -3062,7 +3047,7 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562 for(pow2 = 48; pow2 < 54; ++pow2) { // Only makes sense to test up the #bits in an IEEE-double mantissa: Any larger and we start losing // LSBs (I.e. 
the test may 'succeed' for pow2 > 53, but is only testing the equivalent of pow2 = 53): - ASSERT(HERE, pow2 < 54, "No point testing > 53-bit inputs due to loss of LSBs!"); + ASSERT(pow2 < 54, "No point testing > 53-bit inputs due to loss of LSBs!"); printf("Testing fma_dmult for %d bits, dmult = %f:\n",pow2,pow2_dmult); l2lo = l2hi = cy_max = 0.; // Init log2-range-bounds-storing vars for(j = 0; j < 4; j++) { @@ -3143,25 +3128,25 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562 #endif /* if(pow2 == 53 && i < 100) { - printf("I = %d: ax = %llu ay = %llu ahi,alo = %f,%f\n",i, *ax,*ay, *ahi,*alo); - printf("I = %d: bx = %llu by = %llu bhi,blo = %f,%f\n",i, *bx,*by, *bhi,*blo); - printf("I = %d: cx = %llu cy = %llu chi,clo = %f,%f\n",i, *cx,*cy, *chi,*clo); - printf("I = %d: dx = %llu dy = %llu dhi,dlo = %f,%f\n",i, *dx,*dy, *dhi,*dlo); + printf("I = %d: ax = %" PRIu64 " ay = %" PRIu64 " ahi,alo = %f,%f\n",i, *ax,*ay, *ahi,*alo); + printf("I = %d: bx = %" PRIu64 " by = %" PRIu64 " bhi,blo = %f,%f\n",i, *bx,*by, *bhi,*blo); + printf("I = %d: cx = %" PRIu64 " cy = %" PRIu64 " chi,clo = %f,%f\n",i, *cx,*cy, *chi,*clo); + printf("I = %d: dx = %" PRIu64 " dy = %" PRIu64 " dhi,dlo = %f,%f\n",i, *dx,*dy, *dhi,*dlo); } */ - if(cmp_fma_lohi_vs_exact(*ax,*ay,*ahi,*alo, iax,iay,iahi,ialo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, A-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); } - if(cmp_fma_lohi_vs_exact(*bx,*by,*bhi,*blo, ibx,iby,ibhi,iblo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, B-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); } - if(cmp_fma_lohi_vs_exact(*cx,*cy,*chi,*clo, icx,icy,ichi,iclo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, C-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests failed!"); } - if(cmp_fma_lohi_vs_exact(*dx,*dy,*dhi,*dlo, idx,idy,idhi,idlo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, D-outputs differ!\n",pow2,i); ASSERT(HERE, 0, "fma_dmult tests 
failed!"); } + if(cmp_fma_lohi_vs_exact(*ax,*ay,*ahi,*alo, iax,iay,iahi,ialo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, A-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); } + if(cmp_fma_lohi_vs_exact(*bx,*by,*bhi,*blo, ibx,iby,ibhi,iblo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, B-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); } + if(cmp_fma_lohi_vs_exact(*cx,*cy,*chi,*clo, icx,icy,ichi,iclo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, C-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); } + if(cmp_fma_lohi_vs_exact(*dx,*dy,*dhi,*dlo, idx,idy,idhi,idlo)) { ++nerr; printf("ERROR: pow2 = %d, I = %d, D-outputs differ!\n",pow2,i); ASSERT(0, "fma_dmult tests failed!"); } #if 0 #error to-do! double r1,r2, lo,hi; r1 = rng_isaac_rand_double_norm_pm1() * pow2_dmult; // in [-2^50, +2^50] r2 = rng_isaac_rand_double_norm_pm1() * pow2_dmult; // in [-2^50, +2^50] mul50x50_debug(r1,r2, &lo,&hi); - printf("mul50x50_: a,b = %llu, %llu\n",*(uint64*)&r1,*(uint64*)&r2); - printf("mul50x50_: lo = %16llu\n",*(uint64*)alo); - printf("mul50x50_: hi = %16llu\n",*(uint64*)ahi); + printf("mul50x50_: a,b = %" PRIu64 ", %" PRIu64 "\n",*(uint64*)&r1,*(uint64*)&r2); + printf("mul50x50_: lo = %16" PRIu64 "\n",*(uint64*)alo); + printf("mul50x50_: hi = %16" PRIu64 "\n",*(uint64*)ahi); #endif /******************** experimental code: Try squaring [lo,hi] (in ymm1,2), sans intermediate base-normalizations: *******************/ @@ -3240,10 +3225,10 @@ I = 981 Needed extra sub: a = 916753724; p = 11581569; pinv = 370 [a/p = 79.1562 // Use 1.0f as format - .0 means no fractional part, and i/o routines will override the length-1 with actual length: if(cy_max > 0) { itmp64 = cy_max; itmp32 = trailz64(itmp64); itmp64 >>= itmp32; - printf("\tcy_max = %1.0f = %llu * 2^%u\n",cy_max,itmp64,itmp32); + printf("\tcy_max = %1.0f = %" PRIu64 " * 2^%u\n",cy_max,itmp64,itmp32); } else if(cy_max < 0) { itmp64 =-cy_max; itmp32 = trailz64(itmp64); itmp64 >>= 
itmp32; - printf("\tcy_max = %1.0f = -%llu * 2^%u\n",cy_max,itmp64,itmp32); + printf("\tcy_max = %1.0f = -%" PRIu64 " * 2^%u\n",cy_max,itmp64,itmp32); } else { printf("\tcy_max = 0\n"); } @@ -3317,9 +3302,9 @@ void mul50x50_debug(double a, double b, double *lo, double *hi) if(retval) { printf("In cmp_fma_lohi_vs_exact: FMA-double and pure-int DMUL results differ!\n"); printf("dx = %f; dy = %f; hi,lo = %f,%f\n",dx,dy, dhi * (1 - 2*(s1 != 0)), dlo * (1 - 2*(s0 != 0))); - printf("ix = %lld; iy = %lld; ihi,lo = %lld,%llu\n",ix,iy, ihi,ilo); - printf("Unsigned FMA result: ihi = %llX; ilo = %llX\n",*(uint64*)&dhi,*(uint64*)&dlo); - printf("nsh1,0 = %d,%d: ehi = %llu; elo = %llu [mlo = %c%llu]\n",nsh1,nsh0,exact.d1,exact.d0, char_sgn[s1 ^ s0],m0); + printf("ix = %" PRId64 "; iy = %" PRId64 "; ihi,lo = %" PRId64 ",%" PRIu64 "\n",ix,iy, ihi,ilo); + printf("Unsigned FMA result: ihi = %" PRIX64 "; ilo = %" PRIX64 "\n",*(uint64*)&dhi,*(uint64*)&dlo); + printf("nsh1,0 = %d,%d: ehi = %" PRIu64 "; elo = %" PRIu64 " [mlo = %c%" PRIu64 "]\n",nsh1,nsh0,exact.d1,exact.d0, char_sgn[s1 ^ s0],m0); } return retval; } @@ -3386,7 +3371,7 @@ much except as an auxiliary utility. 
uint32 reverse(uint32 i, uint32 nbits) { uint32 j, tmp = 0; - ASSERT(HERE,nbits <= 32,"ERROR: bitlength limit 32 exceeded in call to REVERSE.\n"); + ASSERT(nbits <= 32,"ERROR: bitlength limit 32 exceeded in call to REVERSE.\n"); for(j = 0; j < nbits; j++) { tmp += tmp + (i & 1); i >>= 1; @@ -3595,7 +3580,7 @@ int ith_set_bit32(uint32 x, uint32 bit) uint8 curr_byte; int curr_pop,i,j,k,retval = 0; if(!x || !bit) return -1; - ASSERT(HERE, bit <= 32, "[bit]th-bit specifier out of range!"); + ASSERT(bit <= 32, "[bit]th-bit specifier out of range!"); // Find the byte in which the [bit]th set-bit occurs: for(i = 0; i < 32; i += 8) { curr_byte = (uint8)(x >> i); @@ -3619,7 +3604,7 @@ int ith_set_bit64(uint64 x, uint32 bit) uint8 curr_byte; int curr_pop,i,j,k,retval = 0; if(!x || !bit) return -1; - ASSERT(HERE, bit <= 64, "[bit]th-bit specifier out of range!"); + ASSERT(bit <= 64, "[bit]th-bit specifier out of range!"); // Find the byte in which the [bit]th set-bit occurs: for(i = 0; i < 64; i += 8) { curr_byte = (uint8)(x >> i); @@ -3901,7 +3886,7 @@ DEV uint64 getbits64(uint64 x, uint32 src_bit_start, uint32 nbits, uint32 tgt_bi { const uint64 ones_mask = 0xFFFFFFFFFFFFFFFFull; uint64 mask; - ASSERT(HERE, (nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!"); + ASSERT((nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!"); if(nbits == 0) return 0; mask = (ones_mask >> (64-nbits)); return ((x >> src_bit_start) & mask) << tgt_bit_start; @@ -3914,7 +3899,7 @@ DEV void mvbits64(uint64 x, uint32 src_bit_start, uint32 nbits, uint64*y, uint32 { const uint64 ones_mask = 0xFFFFFFFFFFFFFFFFull; uint64 mask; - ASSERT(HERE, (nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!"); + ASSERT((nbits <= 64) && (src_bit_start+nbits <= 64) && (tgt_bit_start < 64), "Illegal bit-index parameters!"); if(nbits == 0) return; mask = (ones_mask >> 
(64-nbits)); /* Zero out the target bits: */ @@ -3991,7 +3976,7 @@ DEV uint32 is_prime(uint32 n) { // Get nearest Fermat 2-PRP to N in the specified search direction, up or down. Algorithm is slow try-next-odd: DEV uint32 next_prime(uint32 n, int dir) { // direction properly specified? - ASSERT(HERE, ABS(dir) == 1,"next_prime(): Direction of search not properly specified, must = +1 (up) or -1 (down)."); + ASSERT(ABS(dir) == 1,"next_prime(): Direction of search not properly specified, must = +1 (up) or -1 (down)."); // Some special-casing for small n: if(n <= 3 && dir == -1) { return(2*(n == 3)); @@ -4124,7 +4109,7 @@ DEV uint32 twompmodq32(uint32 p, uint32 q) // 2^-p % q int32 j; uint32 lead5, pshift, qhalf, qinv, zshift, start_index, x, lo, hi; - ASSERT(HERE, (q&1) == 1, "twompmodq32: even modulus!"); + ASSERT((q&1) == 1, "twompmodq32: even modulus!"); qhalf = q >> 1; /* = (q-1)/2, since q odd. */ pshift = p + 32; @@ -4166,7 +4151,7 @@ DEV uint32 twompmodq32(uint32 p, uint32 q) // 2^-p % q if((pshift >> j) & (uint32)1) { - DBG_ASSERT(HERE, x < q,"util.c: x < q"); + DBG_ASSERT(x < q,"util.c: x < q"); /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(x > qhalf) { x += x; @@ -4201,7 +4186,7 @@ DEV int twopmodq32(uint32 p, uint32 q) // (2^-p % q) == 0 int32 j; uint32 lead5, pshift, qhalf, qinv, zshift, start_index, x, lo, hi; - ASSERT(HERE, (q&1) == 1, "twopmodq32: even modulus!"); + ASSERT((q&1) == 1, "twopmodq32: even modulus!"); qhalf = q >> 1; /* = (q-1)/2, since q odd. */ pshift = p + 32; if(pshift < p) /* Need special-casing for p just below 2^32 - the primes 2^32-(5,17) are good testcases here. 
*/ @@ -4253,7 +4238,7 @@ DEV int twopmodq32(uint32 p, uint32 q) // (2^-p % q) == 0 if((pshift >> j) & (uint32)1) { - DBG_ASSERT(HERE, x < q,"util.c: x < q"); + DBG_ASSERT(x < q,"util.c: x < q"); /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ if(x > qhalf) { x += x; @@ -4314,7 +4299,7 @@ DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uin uint32 lead6, pshift6, qinv6, zshift6, x6, lo6, hi6, qhalf6; uint32 lead7, pshift7, qinv7, zshift7, x7, lo7, hi7, qhalf7; - DBG_ASSERT(HERE, (q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq32_x8: Inputs nonmonotone!"); + DBG_ASSERT((q0 < q1) && (q1 < q2) && (q2 < q3) && (q3 < q4) && (q4 < q5) && (q5 < q6) && (q6 < q7), "twopmodq32_x8: Inputs nonmonotone!"); qhalf0 = q0 >> 1; /* = (q-1)/2, since q odd. */ qhalf1 = q1 >> 1; @@ -4445,14 +4430,14 @@ DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uin x7 = q7 - lo7; /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ - if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(HERE, x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); } - if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(HERE, x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); } - if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(HERE, x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); } - if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(HERE, x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); } - if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(HERE, x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); } - if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(HERE, x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); } - if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(HERE, x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); } - if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(HERE, x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - 
((-(x7 > qhalf7)) & q7); } + if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); } + if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); } + if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); } + if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); } + if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); } + if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); } + if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); } + if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); } for(j = start_index-2; j >= 0; j--) { @@ -4493,14 +4478,14 @@ DEV int twopmodq32_x8(uint32 q0, uint32 q1, uint32 q2, uint32 q3, uint32 q4, uin x7 = hi7 - lo7 + ((-(hi7 < lo7)) & q7); /* Combines overflow-on-add and need-to-subtract-q-from-sum checks */ - if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(HERE, x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); } - if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(HERE, x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); } - if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(HERE, x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); } - if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(HERE, x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); } - if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(HERE, x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); } - if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(HERE, x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); } - if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(HERE, x6 < q6,"util.c: x6 < 
q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); } - if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(HERE, x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); } + if((pshift0 >> j) & (uint32)1){ DBG_ASSERT(x0 < q0,"util.c: x0 < q0"); x0 = x0 + x0 - ((-(x0 > qhalf0)) & q0); } + if((pshift1 >> j) & (uint32)1){ DBG_ASSERT(x1 < q1,"util.c: x1 < q1"); x1 = x1 + x1 - ((-(x1 > qhalf1)) & q1); } + if((pshift2 >> j) & (uint32)1){ DBG_ASSERT(x2 < q2,"util.c: x2 < q2"); x2 = x2 + x2 - ((-(x2 > qhalf2)) & q2); } + if((pshift3 >> j) & (uint32)1){ DBG_ASSERT(x3 < q3,"util.c: x3 < q3"); x3 = x3 + x3 - ((-(x3 > qhalf3)) & q3); } + if((pshift4 >> j) & (uint32)1){ DBG_ASSERT(x4 < q4,"util.c: x4 < q4"); x4 = x4 + x4 - ((-(x4 > qhalf4)) & q4); } + if((pshift5 >> j) & (uint32)1){ DBG_ASSERT(x5 < q5,"util.c: x5 < q5"); x5 = x5 + x5 - ((-(x5 > qhalf5)) & q5); } + if((pshift6 >> j) & (uint32)1){ DBG_ASSERT(x6 < q6,"util.c: x6 < q6"); x6 = x6 + x6 - ((-(x6 > qhalf6)) & q6); } + if((pshift7 >> j) & (uint32)1){ DBG_ASSERT(x7 < q7,"util.c: x7 < q7"); x7 = x7 + x7 - ((-(x7 > qhalf7)) & q7); } } /*...Double and return. These are specialized for the case where 2^p == 1 mod q implies divisibility, in which case x = (q+1)/2. 
*/ @@ -4565,9 +4550,9 @@ DEV uint32 egcd32_B(int32 *x, int32 *y) int32 d, e, f; if(*x == *y) { - printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x); ASSERT(HERE, 0,"0"); + printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x); ASSERT(0,"0"); } else if((*x == 0) || (*y == 0)) { - printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y); ASSERT(HERE, 0,"0"); + printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y); ASSERT(0,"0"); } while(w) { @@ -4598,9 +4583,9 @@ DEV uint32 egcd32(uint32 *x, uint32 *y) uint32 d, e, f; if(*x == *y) { - printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x); ASSERT(HERE, 0,"0"); + printf("ERROR: eGCD of identical arguments x = y = %u is illegal!\n", *x); ASSERT(0,"0"); } else if((*x == 0) || (*y == 0)) { - printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y); ASSERT(HERE, 0,"0"); + printf("ERROR: eGCD called with zero input: x = %u, y = %u\n", *x, *y); ASSERT(0,"0"); } while(w) @@ -4638,9 +4623,9 @@ DEV uint64 egcd64(uint64 *x, uint64 *y) /* Sign of these 3 doesn't matter since they're just temporaries: */ uint64 d, e, f; if(*x == *y) { - printf("ERROR: eGCD of identical arguments x = y = %llu is illegal!\n", *x); ASSERT(HERE, 0,"0"); + printf("ERROR: eGCD of identical arguments x = y = %" PRIu64 " is illegal!\n", *x); ASSERT(0,"0"); } else if((*x | *y) == 0ull) { - printf("ERROR: eGCD called with zero input: x = %llu, y = %llu\n", *x, *y); ASSERT(HERE, 0,"0"); + printf("ERROR: eGCD called with zero input: x = %" PRIu64 ", y = %" PRIu64 "\n", *x, *y); ASSERT(0,"0"); } while(w) { q = g/w; @@ -4672,8 +4657,8 @@ int32 x2 = z, y2 = n, gcd2; if(x2 < 0) // since egcd32() only does positive-result normalization on x-output, only do it here to the egcd32_B x-output x2 += n; if(gcd != gcd2 || x != x2 || y != y2) - ASSERT(HERE, 0,"2 gcd results in modinv32 differ!"); - ASSERT(HERE, gcd == 1,"gcd in modinv32 is non-unity!"); + 
ASSERT(0,"2 gcd results in modinv32 differ!"); + ASSERT(gcd == 1,"gcd in modinv32 is non-unity!"); return x; } @@ -4681,7 +4666,7 @@ DEV int64 modinv64(uint64 z, uint64 n) { uint64 x = z, y = n, gcd; gcd = egcd64(&x, &y); - ASSERT(HERE, gcd == 1ull,"gcd in modinv64 is non-unity!"); + ASSERT(gcd == 1ull,"gcd in modinv64 is non-unity!"); return x; } @@ -4884,7 +4869,7 @@ uint32 x128_div_y32(uint128 *x, uint32 y) cy = (two64mody >> 63); two64mody += (-cy) & y; two64divy += (cy == 0); -/*printf("INIT: two64divy, two64mody = %20llu %20llu\n\n", two64divy, two64mody); */ +/*printf("INIT: two64divy, two64mody = %20" PRIu64 " %20" PRIu64 "\n\n", two64divy, two64mody); */ } /* Divide high digit by y, storing remainder in cy: */ @@ -4902,7 +4887,7 @@ uint32 x128_div_y32(uint128 *x, uint32 y) prior to dividing risks unsigned integer overflow: */ (x->d0) = cy*two64divy + tsum/y + (x->d0)/y; -/*printf("%20llu %20llu %2llu %2llu\n", x->d1, x->d0, cy, rem); */ +/*printf("%20" PRIu64 " %20" PRIu64 " %2" PRIu64 " %2" PRIu64 "\n", x->d1, x->d0, cy, rem); */ return (uint32)rem; } @@ -5350,7 +5335,7 @@ double convert_base10_char_double (const char*char_buf) if(c == '.') /* Found a decimal point */ { - ASSERT(HERE, curr_mul == 0.0,"curr_mul == 0.0"); /* Make sure this is the first . we've encountered */ + ASSERT(curr_mul == 0.0,"curr_mul == 0.0"); /* Make sure this is the first . 
we've encountered */ curr_mul = 1.0; continue; } @@ -5361,12 +5346,12 @@ double convert_base10_char_double (const char*char_buf) else { fprintf(stderr,"convert_base10_char_double: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c); - ASSERT(HERE, curr_mul == 0.0,"curr_mul == 0.0"); + ASSERT(curr_mul == 0.0,"curr_mul == 0.0"); } } curr_mul *= 0.1; /* Only has an effect if we're to the right of the DP */ curr_digit = (uint64)(c - CHAROFFSET); - ASSERT(HERE, curr_digit < 10,"convert_base10_char_double: curr_digit < 10"); + ASSERT(curr_digit < 10,"convert_base10_char_double: curr_digit < 10"); /* Store 10*currsum in a 128-bit product, so can check for overflow: */ #ifdef MUL_LOHI64_SUBROUTINE MUL_LOHI64((uint64)10,curr_sum,&curr_sum,&hi); @@ -5376,7 +5361,7 @@ double convert_base10_char_double (const char*char_buf) if(hi != 0) { fprintf(stderr, "ERROR: Mul-by-10 overflows in convert_base10_char_double: Offending input string = %s\n", char_buf); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } curr_sum += curr_digit; /* Since currsum now a multiple of 10, adding a single digit at the low end can't overflow */ } @@ -5385,7 +5370,7 @@ double convert_base10_char_double (const char*char_buf) and return that; otherwise we return (double)curr_sum*curr_mul . 
*/ #if 0 - printf("convert_base10_char_double: char_buf = %s, curr_sum = %llu, curr_mul = %lf\n",char_buf, curr_sum, curr_mul); + printf("convert_base10_char_double: char_buf = %s, curr_sum = %" PRIu64 ", curr_mul = %lf\n",char_buf, curr_sum, curr_mul); #endif if(curr_mul == 0.0) { @@ -5435,11 +5420,11 @@ uint64 convert_base10_char_uint64 (const char*char_buf) else { fprintf(stderr,"convert_base10_char_uint64: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } curr_digit = (uint64)(c - CHAROFFSET); - ASSERT(HERE, curr_digit < 10,"convert_base10_char_uint64: curr_digit < 10"); + ASSERT(curr_digit < 10,"convert_base10_char_uint64: curr_digit < 10"); /* Store 10*currsum in a 128-bit product, so can check for overflow: */ #ifdef MUL_LOHI64_SUBROUTINE MUL_LOHI64((uint64)10,curr_sum,&curr_sum,&hi); @@ -5449,7 +5434,7 @@ uint64 convert_base10_char_uint64 (const char*char_buf) if(hi != 0) { fprintf(stderr, "ERROR: Mul-by-10 overflows in convert_base10_char_uint64: Offending input string = %s\n", char_buf); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } curr_sum += curr_digit; /* Since currsum now a multiple of 10, adding a single digit at the low end can't overflow */ } @@ -5503,11 +5488,11 @@ uint128 convert_base10_char_uint128(const char*char_buf) else { fprintf(stderr,"convert_base10_char_uint128: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } curr_digit = (uint64)(c - CHAROFFSET); - ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10"); + ASSERT(curr_digit < 10,"util.c: curr_digit < 10"); /* currsum *= 10, and check for overflow: */ tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len); if(tmp != 0) @@ -5515,13 +5500,13 @@ uint128 convert_base10_char_uint128(const char*char_buf) if(len == LEN_MAX) { fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT128: Offending input string = %s\n", char_buf); - ASSERT(HERE, len <= LEN_MAX,"len <= 
LEN_MAX"); + ASSERT(len <= LEN_MAX,"len <= LEN_MAX"); } curr_sum[len++] = tmp; } len += mi64_add_scalar(curr_sum, curr_digit, curr_sum, len); - ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX"); + ASSERT(len <= LEN_MAX,"len <= LEN_MAX"); } x128.d0 = curr_sum[0]; @@ -5566,11 +5551,11 @@ uint192 convert_base10_char_uint192(const char*char_buf) else { fprintf(stderr,"convert_base10_char_uint192: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } curr_digit = (uint64)(c - CHAROFFSET); - ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10"); + ASSERT(curr_digit < 10,"util.c: curr_digit < 10"); /* currsum *= 10, and check for overflow: */ tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len); if(tmp != 0) @@ -5578,13 +5563,13 @@ uint192 convert_base10_char_uint192(const char*char_buf) if(len == LEN_MAX) { fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT192: Offending input string = %s\n", char_buf); - ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX"); + ASSERT(len <= LEN_MAX,"len <= LEN_MAX"); } curr_sum[len++] = tmp; } len += mi64_add_scalar(curr_sum, curr_digit, curr_sum, len); - ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX"); + ASSERT(len <= LEN_MAX,"len <= LEN_MAX"); } x192.d0 = curr_sum[0]; @@ -5630,11 +5615,11 @@ uint256 convert_base10_char_uint256(const char*char_buf) else { fprintf(stderr,"convert_base10_char_uint256: isdigit(c) fails, s = %s, i = %u, c = %c\n", char_buf, i, c); - ASSERT(HERE, 0,"0"); + ASSERT(0,"0"); } } curr_digit = (uint64)(c - CHAROFFSET); - ASSERT(HERE, curr_digit < 10,"util.c: curr_digit < 10"); + ASSERT(curr_digit < 10,"util.c: curr_digit < 10"); /* currsum *= 10, and check for overflow: */ tmp = mi64_mul_scalar(curr_sum, (uint64)10, curr_sum, len); if(tmp != 0) @@ -5642,13 +5627,13 @@ uint256 convert_base10_char_uint256(const char*char_buf) if(len == LEN_MAX) { fprintf(stderr, "ERROR: Mul-by-10 overflows in CONVERT_BASE10_CHAR_UINT256: Offending input string = 
%s\n", char_buf); - ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX"); + ASSERT(len <= LEN_MAX,"len <= LEN_MAX"); } curr_sum[len++] = tmp; } len += mi64_add_scalar(curr_sum, curr_digit, curr_sum, len); - ASSERT(HERE, len <= LEN_MAX,"len <= LEN_MAX"); + ASSERT(len <= LEN_MAX,"len <= LEN_MAX"); } x256.d0 = curr_sum[0]; @@ -5730,7 +5715,7 @@ double finvest(double x, uint32 numbits) exp = (itmp >> 52) & MASK_EXP; mant = itmp & MASK_MANT; /* Make sure number is normalized: */ - ASSERT(HERE, exp != 0,"finvest: denormalized inputs illegal!"); + ASSERT(exp != 0,"finvest: denormalized inputs illegal!"); /* Store most-significant 8 non-hidden bits: */ byteval = (mant >> 44) & 0x000000ff; @@ -5770,7 +5755,7 @@ ftmp0 = ftmp; if(fabs(err_num)/fabs(err_den) >= 2e-3) { sprintf(cbuf, "finvtest: ftmp0 too inaccurate! ftmp = %e, ftmp0 = %e, relerr = %e\n", ftmp, ftmp0,fabs(err_num)/fabs(err_den)); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } return ftmp; @@ -5799,7 +5784,7 @@ double fisqrtest(double x, uint32 numbits) exp = (itmp >> 52) & MASK_EXP; mant = itmp & MASK_MANT; /* Make sure number is normalized: */ - ASSERT(HERE, exp != 0,"finvest: denormalized inputs illegal!"); + ASSERT(exp != 0,"finvest: denormalized inputs illegal!"); /* Store most-significant 9 non-hidden bits - we'll use either all or the high 8 of these, depending on the parity of the exponent: */ @@ -5873,7 +5858,7 @@ ftmp0 = ftmp; if(fabs(err_num)/fabs(err_den) >= 2e-3) { sprintf(cbuf, "fisqrtest: ftmp0 too inaccurate! 
ftmp = %e, ftmp0 = %e, relerr = %e\n", ftmp, ftmp0,fabs(err_num)/fabs(err_den)); - ASSERT(HERE, 0, cbuf); + ASSERT(0, cbuf); } return ftmp; @@ -5932,7 +5917,7 @@ ftmp0 = ftmp; #ifdef USE_AVX1024 int test_simd_transpose_16x16() { - ASSERT(HERE,0,"function not yet supported!"); + ASSERT(0,"function not yet supported!"); return 0; } #endif @@ -5947,7 +5932,7 @@ ftmp0 = ftmp; const int dim = 64; // #elements in our matrix, allocate 2x this to allow for real/imag side-by-side variant vec_dbl *mem = 0x0, *data; mem = ALLOC_VEC_DBL(mem, 2*dim+4); // Add 4 pads to allow for alignment on up-to-128-byte boundary - data = ALIGN_VEC_DBL(mem); ASSERT(HERE, ((long)data & 0x1f) == 0, "data not 32-byte aligned!"); + data = ALIGN_VEC_DBL(mem); ASSERT(((long)data & 0x1f) == 0, "data not 32-byte aligned!"); // Init the matrix - Input matrix has rows containing [0-7][8-15]...[56-63]: double *dptr = (double *)data; for(i = 0; i < dim; i++) { *(dptr+i) = i; } @@ -7092,7 +7077,7 @@ ftmp0 = ftmp; const int dim = 16; // #elements in our matrix vec_dbl *mem = ALLOC_VEC_DBL(mem, dim+4); // Add 4 pads to allow for alignment on up-to-128-byte boundary vec_dbl *data = ALIGN_VEC_DBL(mem); - ASSERT(HERE, ((long)data & 0x1f) == 0, "data not 32-byte aligned!"); + ASSERT(((long)data & 0x1f) == 0, "data not 32-byte aligned!"); // Init the matrix - Input matrix has rows: double *dptr = (double *)data; // 0, 1, 2, 3 for(i = 0; i < dim; i++) { // 4, 5, 6, 7 @@ -7251,9 +7236,9 @@ ftmp0 = ftmp; 13.,15.,31.,16.,-17.,27.,45.,28.,6.,-25.,-24.,15.,-6.,-1.,48.,-57.}; vec_dbl *c_tmp,*s_tmp, *cc0,*two, *r0,*r1,*r2,*r3; // Alloc 8 vector-complex elts (16 vec_dbl) per input/output block rather than 4, so can also test two radix-4 DFTs done side-by-side: - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x42); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 0x42); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate 
sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!"); add0 = sc_ptr; add1 = sc_ptr+0x2; add2 = sc_ptr+0x4; @@ -7281,7 +7266,7 @@ ftmp0 = ftmp; VEC_DBL_INIT(c_tmp+6, ran[i+8]); VEC_DBL_INIT(s_tmp+6, ran[i+9]); /* // Restructure twiddle-muls to use cotangent-scheme: - ASSERT(HERE, ran[i+1] != 0.0 && ran[i+9] != 0.0,"Need to modify test-twiddles to avoid div-by-0!"); + ASSERT(ran[i+1] != 0.0 && ran[i+9] != 0.0,"Need to modify test-twiddles to avoid div-by-0!"); VEC_DBL_INIT(c_tmp , ran[i ]/(double)ran[i+1]); VEC_DBL_INIT(s_tmp , ran[i+1]); VEC_DBL_INIT(c_tmp+8, ran[i+8]/(double)ran[i+9]); VEC_DBL_INIT(s_tmp+8, ran[i+9]); */ @@ -7502,7 +7487,7 @@ ftmp0 = ftmp; nerr += (fabs(*(dptr+3) - ref1[j+3]) > 1e-10); dptr += 4; } - ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "Outputs mismatch ref-data!"); // Timing loop #2 - two radix-4 DFTs (operating on separate data chunks but sharing twiddles) side-by-side: /* 6 May 2016, Core2: @@ -7542,7 +7527,7 @@ ftmp0 = ftmp; nerr += (*dptr != ref1[j]) + (*(dptr+1) != ref1[j+1]) + (*(dptr+2) != ref1[j+2]) + (*(dptr+3) != ref1[j+3]); dptr += 4; } - ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "Outputs mismatch ref-data!"); // Timing loop #3 - single radix-4 DIT DFT: dim = 8*RE_IM_STRIDE; // 4 vector-complex data @@ -7565,7 +7550,7 @@ ftmp0 = ftmp; nerr += (*dptr != ref2[j]) + (*(dptr+1) != ref2[j+1]) + (*(dptr+2) != ref2[j+2]) + (*(dptr+3) != ref2[j+3]); dptr += 4; } - ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "Outputs mismatch ref-data!"); // Timing loop #4 - two radix-4 DIT DFTs (operating on separate data chunks but sharing twiddles) side-by-side: for(j = 0; j < dim+dim; j++) { *(add0+j) = ran[j]; } @@ -7588,7 +7573,7 @@ ftmp0 = ftmp; nerr += (*dptr != 
ref2[j]) + (*(dptr+1) != ref2[j+1]) + (*(dptr+2) != ref2[j+2]) + (*(dptr+3) != ref2[j+3]); dptr += 4; } - ASSERT(HERE, nerr == 0, "Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "Outputs mismatch ref-data!"); free((void *)sc_arr); sc_arr=0x0; return nerr; @@ -7655,9 +7640,9 @@ ftmp0 = ftmp; const int stride = 2*RE_IM_STRIDE, dim = stride<<4; double c1,c2,c3,c4,c5,c6,c7,c8,c9,cA,cB,cC,cD,cE,cF, s1,s2,s3,s4,s5,s6,s7,s8,s9,sA,sB,sC,sD,sE,sF; static double *a,*a_ptr; // Dimension = number of scalar-doubles in 16 vector-complex in SIMD build mode - a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE); if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE); if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } a = ALIGN_VEC_DBL(a_ptr); - ASSERT(HERE, ((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!"); + ASSERT(((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!"); #ifdef USE_SSE2 const int pfetch_dist = 0; int pfetch_addr = 0; // Don't care about pfetch in this lcal-mem context, so just set these = 0 @@ -7665,9 +7650,9 @@ ftmp0 = ftmp; double *add0,*add1,*add2; /* Addresses into array sections */ vec_dbl *c_tmp,*s_tmp, *i0,*i1,*i2,*i3, *o0,*o1,*o2,*o3; static vec_dbl *cc0, *ss0, *isrt2, *two, *r00; - sc_arr = ALLOC_VEC_DBL(sc_arr, 72); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 72); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!"); r00 = sc_ptr + 0x00; isrt2 = sc_ptr + 0x20; cc0 = sc_ptr + 0x21; ss0 = sc_ptr + 0x22; @@ -8152,7 +8137,7 @@ exit(0); 
#endif } printf("DIF: nerr = %u, ",nerr); - ASSERT(HERE, nerr == 0, "DIF Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "DIF Outputs mismatch ref-data!"); printf("\tSummed roundoff error = %20.10e]\n",avg_err); //******************* Timing loop for Radix-16 DIT transform macro: ******************* @@ -8465,7 +8450,7 @@ exit(0); dtmp = fabs(a[j1+1] - ref2[j2+1]); avg_err += dtmp; if(dtmp > 1e-10){ /*printf("error Im.d0\n");*/ nerr++; }; #endif } - ASSERT(HERE, nerr == 0, "DIT Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "DIT Outputs mismatch ref-data!"); printf("\tSummed roundoff error = %20.10e]\n",avg_err); #ifdef USE_SSE2 @@ -8563,9 +8548,9 @@ exit(0); const int stride = 2*RE_IM_STRIDE, dim = stride<<5, idx[32] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62}; double cc[32],ss[32]; static double *a,*a_ptr; // Dimension = number of scalar-doubles in 16 vector-complex in SIMD build mode - a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE); if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + a_ptr = ALLOC_VEC_DBL(a_ptr, dim/RE_IM_STRIDE); if(!a_ptr){ sprintf(cbuf, "ERROR: unable to allocate a_ptr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } a = ALIGN_VEC_DBL(a_ptr); - ASSERT(HERE, ((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!"); + ASSERT(((long)a & SZ_VDM1) == 0, "a0_ptr not 64-byte aligned!"); #ifdef USE_SSE2 const int pfetch_dist = 0; int pfetch_addr = 0; // Don't care about pfetch in this lcal-mem context, so just set these = 0 @@ -8573,9 +8558,9 @@ exit(0); double *add0; /* Addresses into array sections */ vec_dbl *c_tmp,*s_tmp; static vec_dbl *isrt2,*sqrt2, *cc0, *ss0, *cc1, *ss1, *cc3, *ss3, *one,*two, *r00,*r10,*r20,*r30; - sc_arr = ALLOC_VEC_DBL(sc_arr, 0x90); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); } + sc_arr = ALLOC_VEC_DBL(sc_arr, 
0x90); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr in %s.\n",func); fprintf(stderr,"%s", cbuf); ASSERT(0,cbuf); } sc_ptr = ALIGN_VEC_DBL(sc_arr); - ASSERT(HERE, ((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!"); + ASSERT(((long)sc_ptr & SZ_VDM1) == 0, "sc_ptr not 64-byte aligned!"); r00 = sc_ptr; r10 = r00 + 0x10; r20 = r00 + 0x20; @@ -8756,7 +8741,7 @@ exit(0); nerr += (fabs(a[j1+1] - ref1[j2+1]) > 1e-10); #endif } - ASSERT(HERE, nerr == 0, "DIF Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "DIF Outputs mismatch ref-data!"); printf("\tSummed roundoff error = %20.10e]\n",avg_err); #if 0 10^6-timing: setup +=DIF DIF-only @@ -8877,7 +8862,7 @@ exit(0); nerr += (fabs(a[j1+1] - ref2[j2+1]) > 1e-10); #endif } - ASSERT(HERE, nerr == 0, "DIT Outputs mismatch ref-data!"); + ASSERT(nerr == 0, "DIT Outputs mismatch ref-data!"); printf("\tSummed roundoff error = %20.10e]\n",avg_err); #if 0 10^6-timing: setup +=DIF DIF-only @@ -8993,7 +8978,7 @@ exit(0); int ncpu = get_num_cores(), nshift, nextra; printf("Mlucas running as system-created pthread %u, threading self-test will use %d user-created pthreads.\n", (int)pth, nthreads); if(verbose) { - ASSERT(HERE, nthreads > 0,"Mlucas.c: nthreads > 0"); + ASSERT(nthreads > 0,"Mlucas.c: nthreads > 0"); if(nthreads > ncpu) { printf("WARN: Test using more threads[%d] than there are available CPUs[%d].\n", nthreads, ncpu); } @@ -9088,7 +9073,7 @@ exit(0); // 10 sequential iters of test loop yield successive values -1452071552,1390824192,-61247360,-1513318912,1329576832, // -122494720,-1574566272,1268329472,-1837420,-1635813632: - ASSERT(HERE, isum == -1635813632, "retval error!"); + ASSERT(isum == -1635813632, "retval error!"); return 0; } @@ -9123,7 +9108,7 @@ exit(0); int i; /* counter, to print numbers */ int j; /* counter, for delay */ int k = 0; /* accumulator to keep gcc from otimizing away delay-multiply inside test loop */ - ASSERT(HERE, thread_arg != 0x0, "do_loop test function for pthread-test 
needs live thread_arg pointer!"); + ASSERT(thread_arg != 0x0, "do_loop test function for pthread-test needs live thread_arg pointer!"); #if 0 // BSD thread affinity API barfs in my Mac builds cpuset_t *cset; @@ -9132,7 +9117,7 @@ exit(0); cset = cpuset_create(); if (cset == NULL) { - ASSERT(HERE, 0, "cpuset_create"); + ASSERT(0, "cpuset_create"); } ci = 0; cpuset_set(ci, cset); @@ -9140,7 +9125,7 @@ exit(0); pth = pthread_self(); error = pthread_setaffinity_np(pth, cpuset_size(cset), cset); if (error) { - ASSERT(HERE, 0, "pthread_setaffinity_np"); + ASSERT(0, "pthread_setaffinity_np"); } cpuset_destroy(cset); #endif @@ -9172,20 +9157,20 @@ exit(0); nobjs1 = hwloc_get_nbobjs_by_type (topology, HWLOC_OBJ_CORE); nobjs2 = hwloc_get_nbobjs_by_depth(topology, depth); if(nobjs1 != nobjs2) { - snprintf(cbuf,STR_MAX_LEN,"#objects of type CORE (%d) mismatches #objects (%d) at depth %d (topo depth = %d).",nobjs1,nobjs2,depth,topodepth); - ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"#objects of type CORE (%d) mismatches #objects (%d) at depth %d (topo depth = %d).",nobjs1,nobjs2,depth,topodepth); + ASSERT(0,cbuf); } // Loop over HWLOC_OBJ_CORE objects corr. 
to index range: for (i = lidx_lo; i <= lidx_hi; i++) { hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i); if (!obj) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i); ASSERT(0,cbuf); } - ASSERT(HERE, obj->type == HWLOC_OBJ_CORE, "[hwloc] Error: Object not of expected type CORE."); + ASSERT(obj->type == HWLOC_OBJ_CORE, "[hwloc] Error: Object not of expected type CORE."); while(obj && (obj->type != HWLOC_OBJ_PACKAGE)) { obj = obj->parent; } - ASSERT(HERE, obj != 0, "[hwloc] Error: PACKAGE Object not found."); + ASSERT(obj != 0, "[hwloc] Error: PACKAGE Object not found."); if(obj->logical_index != socket_idx) { nsockets++; socket_idx = obj->logical_index; @@ -9202,22 +9187,22 @@ exit(0); { int ncpu = 0, lo = -1,hi = lo,incr = 1, i,j,bit,word; char *char_addr = istr, *endp; - ASSERT(HERE, char_addr != 0x0, "Null input-string pointer!"); + ASSERT(char_addr != 0x0, "Null input-string pointer!"); size_t len = strlen(istr); if(len == 0) return 0; // Allow 0-length input, resulting in no-op - ASSERT(HERE, len <= STR_MAX_LEN, "Excessive input-substring length!"); - lo = strtoul(char_addr, &endp, 10); ASSERT(HERE, lo >= 0, "lo-substring not a valid nonnegative number!"); + ASSERT(len <= STR_MAX_LEN, "Excessive input-substring length!"); + lo = strtoul(char_addr, &endp, 10); ASSERT(lo >= 0, "lo-substring not a valid nonnegative number!"); if(*endp) { - ASSERT(HERE, *endp == ':', "Non-colon separator in core-affinity-triplet substring!"); + ASSERT(*endp == ':', "Non-colon separator in core-affinity-triplet substring!"); char_addr = endp+1; hi = strtoul(char_addr, &endp, 10); - ASSERT(HERE, hi >= lo, "hi-substring not a valid number >= lo!"); + ASSERT(hi >= lo, "hi-substring not a valid number >= lo!"); if(*endp) { - ASSERT(HERE, *endp == ':', "Non-colon separator in core-affinity-triplet substring!"); + ASSERT(*endp == 
':', "Non-colon separator in core-affinity-triplet substring!"); char_addr = endp+1; incr = strtoul(char_addr, &endp, 10); - ASSERT(HERE, incr > 0, "incr-substring not a valid positive number!"); - ASSERT(HERE, *endp == 0x0, "Non-numeric increment substring in core-affinity-triplet substring!"); + ASSERT(incr > 0, "incr-substring not a valid positive number!"); + ASSERT(*endp == 0x0, "Non-numeric increment substring in core-affinity-triplet substring!"); } else { // If increment (third) argument of triplet omitted, default to incr = 1. } @@ -9241,7 +9226,7 @@ exit(0); hwloc_obj_t obj_core, obj_pu; obj_core = hwloc_get_obj_by_type(hw_topology, HWLOC_OBJ_CORE, i); if (!obj_core) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: HWLOC_OBJ_CORE[%u] not found.\n",i); ASSERT(0,cbuf); } // 2. for each HWLOC_OBJ_CORE object in the above set, verify that it has at least (n) children /* @@ -9252,14 +9237,14 @@ exit(0); '-cpu 0:11', or even more simply '-nthread 12') to use all 12 threads. 
*/ if (obj_core->arity < incr) { - snprintf_nowarn(cbuf,STR_MAX_LEN,"[hwloc] Error: Requested threads_per_core (%u) exceeds arity (%u) of HWLOC_OBJ_CORE[%u].\n",incr,obj_core->arity,i); ASSERT(HERE,0,cbuf); + snprintf(cbuf,STR_MAX_LEN*2,"[hwloc] Error: Requested threads_per_core (%u) exceeds arity (%u) of HWLOC_OBJ_CORE[%u].\n",incr,obj_core->arity,i); ASSERT(0,cbuf); } for (j = 0; j < incr; j++) { obj_pu = obj_core->children[j]; // Set bit = (obj_pu->logical_index) in CORE_SET bitmap, used in thread-affinity setting: bit = obj_pu->logical_index; if(mi64_test_bit(CORE_SET, bit)) { - sprintf(cbuf, "HWLOC_OBJ_PU %d multiply specified in affinity-setting!",bit); ASSERT(HERE, 0, cbuf); + sprintf(cbuf, "HWLOC_OBJ_PU %d multiply specified in affinity-setting!",bit); ASSERT(0, cbuf); } else { mi64_set_bit(CORE_SET, bit, MAX_CORES>>6, 1); #if INCLUDE_HWLOC==2 @@ -9273,8 +9258,8 @@ exit(0); // CPU set encoded by integer-triplet argument corresponds to values of integer loop // index i in the C-loop for(i = lo; i < hi; i += incr), excluding loop-exit value of i: for(i = lo; i <= hi; i += incr, ncpu++) { - word = i>>6; bit = i & 63; ASSERT(HERE, word < MAX_CORES, "Bitmap word exceeds MAX_CORES!"); - if(CORE_SET[word] & (1ull<>6; bit = i & 63; ASSERT(word < MAX_CORES, "Bitmap word exceeds MAX_CORES!"); + if(CORE_SET[word] & (1ull< 0, "Zero input-string length!"); - ASSERT(HERE, len <= STR_MAX_LEN, "Excessive input-string length!"); + ASSERT(len > 0, "Zero input-string length!"); + ASSERT(len <= STR_MAX_LEN, "Excessive input-string length!"); // Clear existing core-affinity bitmap: for(i = 0; i < MAX_CORES>>6; i++) { CORE_SET[i] = 0ull; } // Affinity-triplet substrings are delimited by commas: @@ -9310,7 +9295,7 @@ exit(0); } } printf("\n"); - ASSERT(HERE, nc == ncpu, "Bitmap #set-bits mismatches #cpu!"); + ASSERT(nc == ncpu, "Bitmap #set-bits mismatches #cpu!"); NTHREADS = ncpu; if(NTHREADS > MAX_THREADS) { // Test this first, since if true, it implies truth of the 'else' 
conditional // fprintf(stderr,"WARN: NTHREADS = %d exceeds number of logical cores = %d ... Affinities for core indices > %d will be set (mod %d).\n",NTHREADS,MAX_THREADS,MAX_THREADS,MAX_THREADS); @@ -9338,7 +9323,7 @@ double get_time(double tdiff) char*get_time_str(double tdiff) { - static char cbuf[STR_MAX_LEN]; + static char cbuf[STR_MAX_LEN*2]; #ifndef MULTITHREAD // In || mode the mod_square routines use getRealTime() to accumulate wall-clock time, thus CLOCKS_PER_SEC not needed tdiff /= CLOCKS_PER_SEC; /* NB: CLOCKS_PER_SEC may be a phony value used to scale clock() ranges */ #endif @@ -9376,7 +9361,7 @@ char *MLUCAS_PATH = ""; On sucess, set_mlucas_path() returns silently On error, set_mlucas_path() prints the cause of error to stderr - and calls ASSERT(HERE, 0, "Exiting."); + and calls ASSERT(0, "Exiting."); possible errors: unable to allocate buffer @@ -9464,7 +9449,7 @@ void set_mlucas_path(void) free(mlucas_path); out_err_check: if (has_err) - ASSERT(HERE, 0, "Exiting."); + ASSERT(0, "Exiting."); } /* Double-quote all spaces in the string pointed by src and write it to dest. 
@@ -9530,7 +9515,7 @@ int mkdir_p(char *path) fp = popen(cmdstr, "r"); if (fp == NULL) { fprintf(stderr, "ERROR: unable to open pipe fp in mkdir_p()\n"); - ASSERT(HERE, 0, "Exiting."); + ASSERT(0, "Exiting."); } fgets(tmp, STR_MAX_LEN + 1, fp); pclose(fp); @@ -9601,7 +9586,7 @@ FILE *mlucas_fopen(const char *path, const char *mode) */ void mlucas_fprint(char*const cstr, uint32 echo_to_stderr) { - ASSERT(HERE, cstr != 0x0 && strlen(cstr) > 0,"Null string-pointer or empty string supplied to mlucas_fprint!"); + ASSERT(cstr != 0x0 && strlen(cstr) > 0,"Null string-pointer or empty string supplied to mlucas_fprint!"); if(echo_to_stderr) fprintf(stderr,"%s",cstr); if(echo_to_stderr < 2) { @@ -9621,7 +9606,7 @@ double mlucas_getOptVal(const char*fname, char*optname) { const char func[] = "mlucas_getOptVal"; char cstr[STR_MAX_LEN], *cptr,*cadd; - ASSERT(HERE, fname != 0x0 && strlen(fname) > 0,"Null filename-pointer or empty string supplied to mlucas_getOptVal!"); + ASSERT(fname != 0x0 && strlen(fname) > 0,"Null filename-pointer or empty string supplied to mlucas_getOptVal!"); FILE *fptr = mlucas_fopen(fname,"r"); double result = strtod("NaN", 0x0); if(fptr) { diff --git a/src/util.h b/src/util.h index 5b4c8b68..620070ab 100755 --- a/src/util.h +++ b/src/util.h @@ -107,7 +107,7 @@ for(i = 0; i < 256; i++) { for(j = bit ; j < 8; j++) { x32 += 0xf<<(4*j); } - printf("0x%8X,",x32); + printf("%#8X,",x32); } printf("\n"); */ @@ -229,8 +229,12 @@ void WARN (long line, char*file, char*warn_string, char*warn_file, int copy2stde __device__ void ASSERT(long line, char*file, int expr, char*assert_string); #else - void ASSERT (long line, char*file, int expr, char*assert_string); + // void ASSERT (long line, char*file, int expr, char*assert_string); + void _ASSERT(const char*assertion, const char*file, long line, const char*func, bool expr, const char*assert_string); #endif + +#define ASSERT(expr, assert_string) _ASSERT(#expr, __FILE__, __LINE__, __func__, (expr), assert_string) + void 
VAR_WARN(char *typelist, ...); void byte_bitstr(const uint8 byte, char*ostr);