Skip to content

Commit

Permalink
Fixed pointer-to-int-cast errors.
Browse files Browse the repository at this point in the history
  • Loading branch information
tdulcet committed Feb 19, 2024
1 parent 6a69fd4 commit 4e29ce7
Show file tree
Hide file tree
Showing 49 changed files with 320 additions and 320 deletions.
14 changes: 7 additions & 7 deletions makemake.sh
Original file line number Diff line number Diff line change
Expand Up @@ -197,27 +197,27 @@ if [[ ${#MODES[*]} -eq 1 ]]; then
ARGS+=(-DUSE_AVX512 -march=knl)
;;
'avx512')
echo "Building for avx512 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
echo "Building for AVX512 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
ARGS+=(-DUSE_AVX512 -mavx512f)
;;
'k1om')
echo "Building for 1st-gen Xeon Phi 512-bit SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
ARGS+=(-DUSE_IMCI512)
;;
'avx2')
echo "Building for avx2 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
echo "Building for AVX2 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
ARGS+=(-DUSE_AVX2 -mavx2)
;;
'avx')
echo "Building for avx SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
echo "Building for AVX SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
ARGS+=(-DUSE_AVX -mavx)
;;
'sse2')
echo "Building for sse2 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
echo "Building for SSE2 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
ARGS+=(-DUSE_SSE2 -msse2)
;;
'asimd')
echo "Building for asimd SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
echo "Building for ASIMD SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
ARGS+=(-DUSE_ARM_V8_SIMD)
;;
'nosimd')
Expand Down Expand Up @@ -273,7 +273,7 @@ elif [[ $OSTYPE =~ ^linux ]]; then
ARGS+=(-DUSE_SSE2 -march=native)
elif grep -iq 'asimd' /proc/cpuinfo; then
echo -e "The CPU supports the ASIMD build mode.\n"
ARGS+=(-DUSE_ARM_V8_SIMD -march=native)
ARGS+=(-DUSE_ARM_V8_SIMD) # -march=native
else
echo -e "The CPU supports no Mlucas-recognized SIMD build mode ... building in scalar-double mode.\n"
ARGS+=(-march=native)
Expand Down Expand Up @@ -307,7 +307,7 @@ int main()
#elif defined(__aarch64__)
#ifdef __ARM_NEON
fputs("The CPU supports the ASIMD build mode.\n\n", stderr);
puts("-DUSE_ARM_V8_SIMD -march=native");
puts("-DUSE_ARM_V8_SIMD"); // -march=native
#else
fputs("The CPU supports no Mlucas-recognized SIMD build mode ... building in scalar-double mode.\n\n", stderr);
puts("-march=native");
Expand Down
8 changes: 4 additions & 4 deletions src/Mlucas.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
if(!BASE_MULTIPLIER_BITS) {
j = ((ITERS_BETWEEN_CHECKPOINTS+63) >> 6) + 1; // Add 1 pad element in case compiler does not 64-bit align
BASE_MULTIPLIER_BITS = ALLOC_UINT64(BASE_MULTIPLIER_BITS, j); if(!BASE_MULTIPLIER_BITS){ sprintf(cbuf, "ERROR: unable to allocate BASE_MULTIPLIER_BITS array in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
BASE_MULTIPLIER_BITS = ALIGN_UINT64(BASE_MULTIPLIER_BITS); ASSERT(HERE, ((long)BASE_MULTIPLIER_BITS & 63) == 0x0,"BASE_MULTIPLIER_BITS[] not aligned on 64-byte boundary!");
BASE_MULTIPLIER_BITS = ALIGN_UINT64(BASE_MULTIPLIER_BITS); ASSERT(HERE, ((intptr_t)BASE_MULTIPLIER_BITS & 63) == 0x0,"BASE_MULTIPLIER_BITS[] not aligned on 64-byte boundary!");
for(i = 0; i < j; i++) { BASE_MULTIPLIER_BITS[i] = 0ull; } // v20: Init = 0 here, in case we jump directly into p-1 stage 2 on restart
}

Expand Down Expand Up @@ -1415,9 +1415,9 @@ with the default #threads = 1 and affinity set to logical core 0, unless user ov
}
a_ptmp = ALLOC_DOUBLE(a_ptmp, j*nalloc); if(!a_ptmp){ sprintf(cbuf, "ERROR: unable to allocate array A in main.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
a = ALIGN_DOUBLE(a_ptmp);
ASSERT(HERE, ((long)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
if(((long)a & 127) != 0x0)
fprintf(stderr, "WARN: a[] = 0x%08lX not aligned on 128-byte boundary!\n", (long)a);
ASSERT(HERE, ((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
if(((intptr_t)a & 127) != 0x0)
fprintf(stderr, "WARN: a[] = 0x%08lX not aligned on 128-byte boundary!\n", (intptr_t)a);
// v19: Add three more full-residue arrays to support 2-input FFT-modmul needed for Gerbicz check (and later, p-1 support):
if(use_lowmem < 2) {
b = a + nalloc; c = b + nalloc; d = c + nalloc, e = d + nalloc;
Expand Down
24 changes: 12 additions & 12 deletions src/align.h
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -38,40 +38,40 @@ util.c::check_nbits_in_types()>
*/

#define ALLOC_INT(_p,_n) (int *)realloc(_p,(_n)*sizeof(int )+256)
#define ALIGN_INT(_p) (int *)(((long)(_p) | 63)+1)
#define ALIGN_INT(_p) (int *)(((intptr_t)(_p) | 63)+1)

#define ALLOC_UINT(_p,_n) (uint *)realloc(_p,(_n)*sizeof(uint )+256)
#define ALIGN_UINT(_p) (uint *)(((long)(_p) | 63)+1)
#define ALIGN_UINT(_p) (uint *)(((intptr_t)(_p) | 63)+1)

#define ALLOC_INT64(_p,_n) (int64 *)realloc(_p,(_n)*sizeof(int64 )+256)
#define ALIGN_INT64(_p) (int64 *)(((long)(_p) | 63)+1)
#define ALIGN_INT64(_p) (int64 *)(((intptr_t)(_p) | 63)+1)

#define ALLOC_UINT64(_p,_n) (uint64 *)realloc(_p,(_n)*sizeof(uint64 )+256)
#define ALIGN_UINT64(_p) (uint64 *)(((long)(_p) | 63)+1)
#define ALIGN_UINT64(_p) (uint64 *)(((intptr_t)(_p) | 63)+1)

#define ALLOC_UINT128(_p,_n)(uint128 *)realloc(_p,(_n+_n)*sizeof(uint64 )+256)
#define ALIGN_UINT128(_p) (uint128 *)(((long)(_p) | 63)+1)
#define ALIGN_UINT128(_p) (uint128 *)(((intptr_t)(_p) | 63)+1)

#define ALLOC_FLOAT(_p,_n) (float *)realloc(_p,(_n)*sizeof(float )+256)
#define ALIGN_FLOAT(_p) (float *)(((long)(_p) | 63)+1)
#define ALIGN_FLOAT(_p) (float *)(((intptr_t)(_p) | 63)+1)

#define ALLOC_DOUBLE(_p,_n) (double *)realloc(_p,(_n)*sizeof(double )+512)
#define ALIGN_DOUBLE(_p) (double *)(((long)(_p) | 127)+1)
#define ALIGN_DOUBLE(_p) (double *)(((intptr_t)(_p) | 127)+1)

#define ALLOC_f128(_p,_n) (__float128 *)realloc(_p,(_n)*sizeof(__float128 )+512)
#define ALIGN_f128(_p) (__float128 *)(((long)(_p) | 127)+1)
#define ALIGN_f128(_p) (__float128 *)(((intptr_t)(_p) | 127)+1)

#define ALLOC_COMPLEX(_p,_n)(struct complex*)realloc(_p,(_n)*sizeof(struct complex)+512)
#define ALIGN_COMPLEX(_p) (struct complex*)(((long)(_p) | 127)+1)
#define ALIGN_COMPLEX(_p) (struct complex*)(((intptr_t)(_p) | 127)+1)

// Vector-double|uint64-alloc used by SIMD builds; register size difference between YMM and XMM taken care of by def of vec_dbl in types.h:
#ifdef USE_SSE2

#define ALLOC_VEC_DBL(_p,_n)(vec_dbl*)realloc(_p,(_n)*sizeof(vec_dbl)+512)
#define ALIGN_VEC_DBL(_p) (vec_dbl*)(((long)(_p) | 127)+1)
#define ALIGN_VEC_DBL(_p) (vec_dbl*)(((intptr_t)(_p) | 127)+1)

#define ALLOC_VEC_U64(_p,_n)(vec_u64*)realloc(_p,(_n)*sizeof(vec_u64)+512)
#define ALIGN_VEC_U64(_p) (vec_u64*)(((long)(_p) | 127)+1)
#define ALIGN_VEC_U64(_p) (vec_u64*)(((intptr_t)(_p) | 127)+1)

#else // In scalar-mode simply use the above double|uint64 macros:

Expand All @@ -84,7 +84,7 @@ util.c::check_nbits_in_types()>
#endif

#define ALLOC_POINTER(_p,_ptr_type,_n)(_ptr_type*)realloc(_p,(_n)*sizeof(_ptr_type)+64)
#define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((long)(_p) | 63)+1)
#define ALIGN_POINTER(_p,_ptr_type) (_ptr_type*)(((intptr_t)(_p) | 63)+1)

#define ALLOC_QFLOAT(_p,_n) ALLOC_UINT128(_p,_n)
#define ALIGN_QFLOAT(_p) ALIGN_UINT128(_p)
Expand Down
8 changes: 4 additions & 4 deletions src/dft_macro.c
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -3407,7 +3407,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
// 126 slots for DFT-63 data, 22 for DFT-7,9 consts and DFT-7 pads, 4 to allow for alignment = 152:
sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
sc_ptr = ALIGN_VEC_DBL(sc_arr);
ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");

#ifdef MULTITHREAD
__r0 = tdat = sc_ptr; tmp = tdat + 126;
Expand Down Expand Up @@ -3656,7 +3656,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
// 126 slots for DFT-63 data, 22 for DFT-7,9 consts and DFT-7 pads, 4 to allow for alignment = 152:
sc_arr = ALLOC_VEC_DBL(sc_arr, 152*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
sc_ptr = ALIGN_VEC_DBL(sc_arr);
ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");

#ifdef MULTITHREAD
__r0 = tdat = sc_ptr; tmp = tdat + 126;
Expand Down Expand Up @@ -3887,7 +3887,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
if(sc_arr) { free((void *)sc_arr); }
sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
sc_ptr = ALIGN_VEC_DBL(sc_arr);
ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");

#ifdef MULTITHREAD
__r0 = tmp = sc_ptr;
Expand Down Expand Up @@ -4290,7 +4290,7 @@ in the same order here as DIF, but the in-and-output-index offsets are BRed: j1
if(sc_arr) { free((void *)sc_arr); }
sc_arr = ALLOC_VEC_DBL(sc_arr, 0x32*max_threads); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
sc_ptr = ALIGN_VEC_DBL(sc_arr);
ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");

#ifdef MULTITHREAD
__r0 = tmp = sc_ptr;
Expand Down
8 changes: 4 additions & 4 deletions src/mi64.c
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -769,10 +769,10 @@ uint64 mi64_shl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
2. if x,y have different 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, skip the ASM-loop;
3. if x,y have same 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, find i0 >= 1 such that x[i0] is SIMD-aligned.
*/
if( ((uint32)x & 0x7) != 0 || ((uint32)y & 0x7) != 0 )
if( ((intptr_t)x & 0x7) != 0 || ((intptr_t)y & 0x7) != 0 )
ASSERT(HERE, 0, "require 8-byte alignment of x,y!");
// In SIMD-ASM case, x_misalign = (0,1,2, or 3) how many words x[0] is above next-lower alignment boundary:
x_misalign = ((uint32)x & BASEADDRMASK)>>3; y_misalign = ((uint32)y & BASEADDRMASK)>>3;
x_misalign = ((intptr_t)x & BASEADDRMASK)>>3; y_misalign = ((intptr_t)y & BASEADDRMASK)>>3;

if(len >= minlen) { // Low-end clean-up loop runs from i = i0 downward thru i = 1 ... x[0] handled separately:
#ifdef USE_AVX2
Expand Down Expand Up @@ -1222,9 +1222,9 @@ uint64 mi64_shrl_short(const uint64 x[], uint64 y[], uint32 nshift, uint32 len)
2. if x,y have different 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, skip the ASM-loop;
3. if x,y have same 16-byte[SSE2/AVX] or 32-byte[AVX2] alignment, find i0 >= 0 such that x[i0] is SIMD-aligned.
*/
if( ((uint32)x & 0x7) != 0 || ((uint32)y & 0x7) != 0 )
if( ((intptr_t)x & 0x7) != 0 || ((intptr_t)y & 0x7) != 0 )
ASSERT(HERE, 0, "require 8-byte alignment of x,y!");
x_misalign = ((uint32)x & BASEADDRMASK)>>3; y_misalign = ((uint32)y & BASEADDRMASK)>>3;
x_misalign = ((intptr_t)x & BASEADDRMASK)>>3; y_misalign = ((intptr_t)y & BASEADDRMASK)>>3;

// minlen may have been incr. for alignment purposes, so use_asm not an unconditional TRUE here
if(len >= minlen && x_misalign != 0) { // Low-end clean-up loop runs from i = 0 upward thru i = i0-1
Expand Down
14 changes: 7 additions & 7 deletions src/pm1.c
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -1139,13 +1139,13 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
sprintf(cbuf, "ERROR: unable to allocate the needed %u buffers of p-1 Stage 2 storage.\n",num_b*m + use_pp1);
mlucas_fprint(cbuf,pm1_standlone+1); ASSERT(HERE, 0,cbuf);
}
a = ALIGN_DOUBLE(a_ptmp); ASSERT(HERE, ((long)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
a = ALIGN_DOUBLE(a_ptmp); ASSERT(HERE, ((intptr_t)a & 63) == 0x0,"a[] not aligned on 64-byte boundary!");
buf = (double **)calloc(num_b*m,sizeof(double *));
// ...and num_b*m "buffers" for precomputed bigstep-coprime odd-square powers of the stage 1 residue:
for(i = 0; i < num_b*m; i++) {
buf[i] = a + i*npad;
// fprintf(stderr,"buf[%3d] = 0x%llX\n",i,(uint64)buf[i]);
ASSERT(HERE, ((long)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
ASSERT(HERE, ((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
}
// Still do fwdFFT(1) as init-FFT step in non-(p+1) build, but use uppermost buf[] entry to hold as throwaway result:
vone = a + (i - 1 + use_pp1)*npad;
Expand All @@ -1168,8 +1168,8 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
const int nbytes_simd_align = (RE_IM_STRIDE*8) - 1; // And per-thread data chunk addresses with this to check SIMD alignment
ASSERT(HERE, ((long)mult[0] & nbytes_simd_align) == 0x0,"mult[0] not aligned on 64-byte boundary!");
ASSERT(HERE, ((long)buf [0] & nbytes_simd_align) == 0x0,"buf [0] not aligned on 64-byte boundary!"); // Since npad a multiple of RE_IM_STRIDE, only need to check buf[0] alignment
ASSERT(HERE, ((intptr_t)mult[0] & nbytes_simd_align) == 0x0,"mult[0] not aligned on 64-byte boundary!");
ASSERT(HERE, ((intptr_t)buf [0] & nbytes_simd_align) == 0x0,"buf [0] not aligned on 64-byte boundary!"); // Since npad a multiple of RE_IM_STRIDE, only need to check buf[0] alignment
j = npad / NTHREADS; // j = #doubles in each thread-processed chunk
/* Fiddle up-or-downward to make it a multiple of RE_IM_STRIDE; say this == 8. Since j == (npad/NTHREADS) - [0 or 1]
due to truncation-on-integer-div, if jmod := (j % RE_IM_STRIDE) < RE_IM_STRIDE/2, subtract jmod from j, otherwise
Expand All @@ -1185,7 +1185,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
tdat[i].retval = &thr_ret[i];
tdat[i].arr0 = a + k; // a[]-array (alias for mult[3]) takes output, a[] = (mult[0][] - buf[i][])
tdat[i].arr1 = mult[0] + k; // k = i*j = doubles-offset for this thread's pair of array pointers
tdat[i].arr2 = (double *)(long)k; // For array-pointer 2, init the fixed offsets, then add fixed base-pointer offset buf[i] to
tdat[i].arr2 = (double *)(intptr_t)k; // For array-pointer 2, init the fixed offsets, then add fixed base-pointer offset buf[i] to
// each k-index offset at thread-dispatch time, re-subtract buf[i] after pool work completion
tdat[i].n = j; // Chunksize
}
Expand Down Expand Up @@ -1414,7 +1414,7 @@ based on iteration count versus PM1_S1_PROD_BITS as computed from the B1 bound,
fprintf(stderr,"%u^2.",j);
#endif
// fprintf(stderr,"buf[%3d] = 0x%llX\n",i,(uint64)buf[i]);
ASSERT(HERE, ((long)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
ASSERT(HERE, ((intptr_t)(buf[i]) & 63) == 0x0,"buf[i] not aligned on 64-byte boundary!");
memcpy(buf[i++],mult[0],nbytes); // buf[i++] = mult[0] = fwd-FFT-pass-1-done(A^1,9,25,...)
}
// Up-multiply the fwd-FFT-pass-1-done(A^8,16,24,...) by fixed multiplier fwd-FFT(A^8):
Expand Down Expand Up @@ -2559,7 +2559,7 @@ MME = 0;
// Add fixed-offset represented by the address of the subtrahend-array c[] to each
// precomputed datachunk offset index. Pointer arithmetic takes care of the *= 8 scaling,
// but first cast the index stored in tdat[i].arr2 to an integer, to avoid the illegal operation of adding two pointers:
tdat[i].arr2 = c + (long)(tdat[i].arr2);
tdat[i].arr2 = c + (intptr_t)(tdat[i].arr2);
task_control.data = (void*)(&tdat[i]);
// printf("adding pool task %d\n",i);
threadpool_add_task(tpool, &task_control, task_is_blocking);
Expand Down
16 changes: 8 additions & 8 deletions src/radix1008_ditN_cy_dif1.c
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[

// This array pointer must be set based on vec_dbl-sized alignment at runtime for each thread:
for(l = 0; l < RE_IM_STRIDE; l++) {
if( ((long)&tdat[ithread].cy_dat[l] & SZ_VDM1) == 0 ) {
if( ((intptr_t)&tdat[ithread].cy_dat[l] & SZ_VDM1) == 0 ) {
tdat[ithread].cy_r = &tdat[ithread].cy_dat[l];
tdat[ithread].cy_i = tdat[ithread].cy_r + RADIX;
// fprintf(stderr,"%d-byte-align cy_dat array at element[%d]\n",SZ_VD,l);
Expand All @@ -524,18 +524,18 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[

#ifdef USE_SSE2

ASSERT(HERE, ((long)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!");
ASSERT(HERE, ((long)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)wt0 & 0x3f) == 0, "wt0[] not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)wt1 & 0x3f) == 0, "wt1[] not 64-byte aligned!");

// Use vector-double type size (16 bytes for SSE2, 32 for AVX) to alloc a block of local storage
// consisting of 128*2 vec_dbl and (8+RADIX/2) uint64 element slots per thread
// (Add as many padding elts to the latter as needed to make it a multiple of 4):
cslots_in_local_store = radix1008_creals_in_local_store + (((12+RADIX/2)/2 + 3) & ~0x3);
sc_arr = ALLOC_VEC_DBL(sc_arr, cslots_in_local_store*CY_THREADS); if(!sc_arr){ sprintf(cbuf, "ERROR: unable to allocate sc_arr!.\n"); fprintf(stderr,"%s", cbuf); ASSERT(HERE, 0,cbuf); }
sc_ptr = ALIGN_VEC_DBL(sc_arr);
ASSERT(HERE, ((long)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)sc_ptr & 0x3f) == 0, "sc_ptr not 64-byte aligned!");
sm_ptr = (uint64*)(sc_ptr + radix1008_creals_in_local_store);
ASSERT(HERE, ((long)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");
ASSERT(HERE, ((intptr_t)sm_ptr & 0x3f) == 0, "sm_ptr not 64-byte aligned!");

#ifdef USE_PTHREAD
__r0 = sc_ptr;
Expand Down Expand Up @@ -574,7 +574,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
} else {
j = ODD_RADIX<<2; // 4*ODD_RADIX
}
ASSERT(HERE, (radix1008_creals_in_local_store << L2_SZ_VD) >= ((long)half_arr - (long)r00) + (j << L2_SZ_VD), "radix1008_creals_in_local_store checksum failed!");
ASSERT(HERE, (radix1008_creals_in_local_store << L2_SZ_VD) >= ((intptr_t)half_arr - (intptr_t)r00) + (j << L2_SZ_VD), "radix1008_creals_in_local_store checksum failed!");

// Roots for radix-16 DFTs:
VEC_DBL_INIT(two , 2.0 ); VEC_DBL_INIT(one , 1.0 );
Expand All @@ -589,7 +589,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
thr_id = 0; // ...then revert to 0.

// Propagate the above consts to the remaining threads:
nbytes = (long)cy_r - (long)two; // #bytes in 1st of above block of consts
nbytes = (intptr_t)cy_r - (intptr_t)two; // #bytes in 1st of above block of consts
tmp = two;
tm2 = tmp + cslots_in_local_store;
for(ithread = 1; ithread < CY_THREADS; ++ithread) {
Expand Down Expand Up @@ -1254,7 +1254,7 @@ int radix1008_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[
tdat[ithread].bjmodn0 = _bjmodnini[ithread];
#ifdef USE_SSE2
tdat[ithread].r00 = __r0 + ithread*cslots_in_local_store;
tdat[ithread].half_arr = (vec_dbl *)((long)tdat[ithread].r00 + ((long)half_arr - (long)r00));
tdat[ithread].half_arr = (vec_dbl *)((intptr_t)tdat[ithread].r00 + ((intptr_t)half_arr - (intptr_t)r00));
#else
// In scalar mode use these 2 ptrs to pass wts_idx_incr and the base/baseinv/etc array-ptrs:
tdat[ithread].r00 = (double *)foo_array;
Expand Down
Loading

0 comments on commit 4e29ce7

Please sign in to comment.