diff --git a/src/radix48_ditN_cy_dif1.c b/src/radix48_ditN_cy_dif1.c
index f0d50cba..5bc5d581 100755
--- a/src/radix48_ditN_cy_dif1.c
+++ b/src/radix48_ditN_cy_dif1.c
@@ -143,38 +143,57 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 !   storage scheme, and radix8_ditN_cy_dif1 for details on the reduced-length weights array scheme.
 */
 	const char func[] = "radix48_ditN_cy_dif1";
+#if !defined(MULTITHREAD) && defined(USE_SSE2)
 	const int pfetch_dist = PFETCH_DIST;
+#endif
 	const int stride = (int)RE_IM_STRIDE << 1;	// main-array loop stride = 2*RE_IM_STRIDE
 #ifdef USE_SSE2
   #if COMPACT_OBJ
 	static uint32 plo[16];
+   #ifndef MULTITHREAD
 	int k0,k1,k2,k3,k4,k5,k6,k7,k8,k9,ka,kb,kc,kd,ke,kf;
+   #endif
   #endif
+  #ifndef MULTITHREAD
 	int po_kperm[16],*po_ptr = &(po_kperm[0]);
+  #endif
 #else
 	static uint32 p0123[4];
 #endif
 	static double wts_mult[2], inv_mult[2];	// Const wts-multiplier and 2*(its multiplicative inverse)
+  #if !defined(MULTITHREAD) && !defined(USE_SSE2)
 	double wt_re,wt_im, wi_re,wi_im;	// Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
+  #endif
   #ifdef USE_AVX512
 	const int jhi_wrap = 15;
   #else
 	const int jhi_wrap =  7;
   #endif
-	int NDIVR,i,incr,j,j1,j2,jt,jp,jstart,jhi,full_pass,k,khi,l,ntmp,outer,nbytes;
+	int NDIVR,i,j,j1,jt,jhi,full_pass,khi,l,outer;
+  #if !defined(MULTITHREAD) && !defined(USE_SSE2)
+	int j2,jp;
+  #endif
+  #ifndef MULTITHREAD
+	int jstart;
+  #endif
+  #if !defined(MULTITHREAD) && (!defined(USE_SSE2) || defined(USE_AVX512))
+	int ntmp;
+  #endif
+  #ifdef USE_SSE2
+	int nbytes;
+  #endif
+  #if !defined(MULTITHREAD) && defined(USE_SSE2)
+	int incr;
 	// incr = Carry-chain wts-multipliers recurrence length;
 	//incr must divide RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = 3|6|12 for avx512,avx,sse, respectively:
-  #ifdef USE_SSE2	// For RADIX = 48, the only SIMD mode for which recurrence is possible is SSE2, others are don't-care
+	// For RADIX = 48, the only SIMD mode for which recurrence is possible is SSE2, others are don't-care
 	const int incr_long = 12,incr_med = 6,incr_short = 4;
-  #else
-	const int incr_long = 0,incr_med = 0,incr_short = 0;
-  #endif
   // Have no specialized HIACC carry macro in USE_AVX512 and ARMv8 SIMD, use nonzero incr-value to differenetiate vs AVX/AVX2:
-  #if defined(USE_AVX512) || defined(USE_ARM_V8_SIMD)
+   #if defined(USE_AVX512) || defined(USE_ARM_V8_SIMD)
 	const int incr_hiacc = 2;
-  #else
+   #else
 	const int incr_hiacc = 0;
-  #endif
+   #endif
 	// Allows cy-macro error data to be used to fiddle incr on the fly to a smaller, safer value if necessary
 	if(USE_SHORT_CY_CHAIN == 0)
 		incr = incr_long;
@@ -184,6 +203,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		incr = incr_short;
 	else
 		incr = incr_hiacc;
+  #endif
 
 	// Jun 2018: Add support for residue shift. (Only LL-test needs intervention at carry-loop level).
 	int target_idx = -1, target_set = 0,tidx_mod_stride;
@@ -198,18 +218,29 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	const double	c3m1= -1.50000000000000000000,	/* cos(twopi/3)-1	*/
 					s   =  0.86602540378443864675;	/* sin(twopi/3)		*/
 #endif
+  #ifndef MULTITHREAD
 	double *addr;
+  #endif
 	double scale, dtmp, maxerr = 0.0;
 	// Local storage: We must use an array here because scalars have no guarantees about relative address offsets
 	// [and even if those are contiguous-as-hoped-for, they may run in reverse]; Make array type (struct complex)
 	// to allow us to use the same offset-indexing as in the original radix-32 in-place DFT macros:
-	struct complex t[RADIX], *tptr;
-	int *itmp,*itm2;	// Pointer into the bjmodn array
+	struct complex t[RADIX];
+  #if !defined(MULTITHREAD) && !defined(USE_SSE2)
+	struct complex *tptr;
+  #endif
+  #ifndef MULTITHREAD
+	int *itmp;	// Pointer into the bjmodn array
+  #endif
+  #if !defined(MULTITHREAD) && defined(USE_AVX) && !defined(USE_AVX512)
+	int *itm2;	// Pointer into the bjmodn array
+  #endif
 	int err;
 	static int first_entry=TRUE;
 
 /*...stuff for the reduced-length DWT weights array is here:	*/
 	int n_div_nwt;
+  #ifndef MULTITHREAD
 	int col,co2,co3;
   #ifdef USE_AVX512
 	double t0,t1,t2,t3;
@@ -224,6 +255,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	int n_minus_sil,n_minus_silp1,sinwt,sinwtm1;
 	double wtl,wtlp1,wtn,wtnm1;	/* Mersenne-mod weights stuff */
   #endif
+  #endif
 
 #ifdef USE_SSE2
 
@@ -239,27 +271,44 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
   #ifdef MULTITHREAD
 	static vec_dbl *__r0;	// Base address for discrete per-thread local stores
   #else
-	double *add0,*add1,*add2,*add3,*add4,*add5,*add6,*add7,*add8,*add9,*adda,*addb,*addc,*addd,*adde,*addf;	/* Addresses into array sections */
+	double *add0,*add1,*add2,*add3;	/* Addresses into array sections */
   #endif
 
 	// Uint64 bitmaps for alternate "rounded the other way" copies of sqrt2,isrt2. Default round-to-nearest versions
 	// (SQRT2, ISRT2) end in ...3BCD. Since we round these down as ...3BCC90... --> ..3BCC, append _dn to varnames:
 	const uint64 sqrt2_dn = 0x3FF6A09E667F3BCCull, isrt2_dn = 0x3FE6A09E667F3BCCull;
+  #ifndef MULTITHREAD
 	static int *bjmodn;	// Alloc mem for this along with other 	SIMD stuff
+  #endif
+  #ifndef USE_AVX512
 	const double crnd = 3.0*0x4000000*0x2000000;
+  #endif
+  #ifndef USE_AVX
 	struct complex *ctmp;	// Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
-	vec_dbl *tmp,*tm1,*tm2;	// Non-static utility ptrs
+  #endif
+	vec_dbl *tmp,*tm2;	// Non-static utility ptrs
+  #ifndef MULTITHREAD
+	vec_dbl *tm1;	// Non-static utility ptrs
+  #endif
 	static vec_dbl *sqrt2,*isrt2,*two,*one, *cc0, *ss0, *cc1, *ss1, *max_err, *sse2_rnd, *half_arr,
-		*r00r,*r01r,*r02r,*r03r,*r04r,*r05r,*r06r,*r07r,
+		*r00r,
+	  #ifndef MULTITHREAD
+	   #if !COMPACT_OBJ
+		*r01r,*r02r,*r03r,*r04r,*r05r,*r06r,*r07r,
 		*r08r,*r09r,*r10r,*r11r,*r12r,*r13r,*r14r,*r15r,
 		*r16r,*r17r,*r18r,*r19r,*r20r,*r21r,*r22r,*r23r,
 		*r24r,*r25r,*r26r,*r27r,*r28r,*r29r,*r30r,*r31r,
 		*r32r,*r33r,*r34r,*r35r,*r36r,*r37r,*r38r,*r39r,
 		*r40r,*r41r,*r42r,*r43r,*r44r,*r45r,*r46r,*r47r,
-		*s1p00r,*s1p01r,*s1p02r,*s1p03r,*s1p04r,*s1p05r,*s1p06r,*s1p07r,*s1p08r,*s1p09r,*s1p10r,*s1p11r,
+	   #endif
+		*s1p00r,
+	   #if !COMPACT_OBJ
+		*s1p01r,*s1p02r,*s1p03r,*s1p04r,*s1p05r,*s1p06r,*s1p07r,*s1p08r,*s1p09r,*s1p10r,*s1p11r,
 		*s1p12r,*s1p13r,*s1p14r,*s1p15r,*s1p16r,*s1p17r,*s1p18r,*s1p19r,*s1p20r,*s1p21r,*s1p22r,*s1p23r,
 		*s1p24r,*s1p25r,*s1p26r,*s1p27r,*s1p28r,*s1p29r,*s1p30r,*s1p31r,*s1p32r,*s1p33r,*s1p34r,*s1p35r,
 		*s1p36r,*s1p37r,*s1p38r,*s1p39r,*s1p40r,*s1p41r,*s1p42r,*s1p43r,*s1p44r,*s1p45r,*s1p46r,*s1p47r,
+	   #endif
+	  #endif
 		*cy;	// Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
 #endif
 
@@ -267,7 +316,10 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	static struct cy_thread_data_t *tdat = 0x0;
 	// Threadpool-based dispatch stuff:
-	static int main_work_units = 0, pool_work_units = 0;
+  #if 0//def OS_TYPE_MACOSX
+	static int main_work_units = 0;
+  #endif
+	static int pool_work_units = 0;
 	static struct threadpool *tpool = 0x0;
 	static int task_is_blocking = TRUE;
 	static thread_control_t thread_control = {0,0,0};
@@ -281,7 +333,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 	int m,m2;
 	double wt,wtinv,wtA,wtB,wtC;	/* Mersenne-mod weights stuff */
 	int bjmodn[RADIX];
-	double rt,it,temp,frac,cy[RADIX],
+	double temp,frac,cy[RADIX],
 			t00,t01,t02,t03,t04,t05;
 
 #endif
@@ -302,8 +354,10 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		ASSERT(0, "Fermat-mod only available for radices 7,8,9,15 and their multiples!");
 	}
 
+  #ifndef MULTITHREAD
 	// Init these to get rid of GCC "may be used uninitialized in this function" warnings:
 	col=co2=co3=-1;
+  #endif
 	// Jan 2018: To support PRP-testing, read the LR-modpow-scalar-multiply-needed bit for the current iteration from the global array:
 	double prp_mult = 1.0;
 	if((TEST_TYPE & 0xfffffffe) == TEST_TYPE_PRP) {	// Mask off low bit to lump together PRP and PRP-C tests
@@ -483,7 +537,9 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		__r0 = sc_ptr;
 	#endif
 		r00r = sc_ptr;			tmp = r00r + 0x60;
-		r00r = r00r + 0x00;		s1p00r = tmp + 0x00;
+		r00r = r00r + 0x00;
+	#ifndef MULTITHREAD
+								s1p00r = tmp + 0x00;
 	  #if !COMPACT_OBJ
 		r01r = r00r + 0x02;		s1p01r = tmp + 0x02;
 		r02r = r00r + 0x04;		s1p02r = tmp + 0x04;
@@ -533,6 +589,7 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 		r46r = r00r + 0x5c;		s1p46r = tmp + 0x5c;
 		r47r = r00r + 0x5e;		s1p47r = tmp + 0x5e;
 	  #endif
+	#endif
 		tmp += 0x60;
 		two     = tmp + 0;	// AVX+ versions of radix-8,16,32 twiddleless-DFT macros need consts [2,1,sqrt2,isrt2] quartet laid out thusly
 		one     = tmp + 1;
@@ -807,23 +864,29 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 
 	  #ifdef USE_AVX512
 	   #ifdef CARRY_16_WAY
+		#ifndef MULTITHREAD
 		n_minus_sil   = (struct uint32x16*)sse_n + 1;
 		n_minus_silp1 = (struct uint32x16*)sse_n + 2;
 		sinwt         = (struct uint32x16*)sse_n + 3;
 		sinwtm1       = (struct uint32x16*)sse_n + 4;
+		#endif
 		nbytes += 256;
 	   #else
+		#ifndef MULTITHREAD
 		n_minus_sil   = (struct uint32x8 *)sse_n + 1;
 		n_minus_silp1 = (struct uint32x8 *)sse_n + 2;
 		sinwt         = (struct uint32x8 *)sse_n + 3;
 		sinwtm1       = (struct uint32x8 *)sse_n + 4;
+		#endif
 		nbytes += 128;
 	   #endif
 	  #elif defined(USE_AVX)
+		#ifndef MULTITHREAD
 		n_minus_sil   = (struct uint32x4 *)sse_n + 1;
 		n_minus_silp1 = (struct uint32x4 *)sse_n + 2;
 		sinwt         = (struct uint32x4 *)sse_n + 3;
 		sinwtm1       = (struct uint32x4 *)sse_n + 4;
+		#endif
 		nbytes += 64;
 	  #endif
 
@@ -835,10 +898,12 @@ int radix48_ditN_cy_dif1(double a[], int n, int nwt, int nwt_bits, double wt0[],
 			tmp = tm2;		tm2 += cslots_in_local_store;
 		}
 
-	#ifdef USE_AVX
+	#ifndef MULTITHREAD
+	 #ifdef USE_AVX
 		bjmodn = (int*)(sinwtm1 + RE_IM_STRIDE);
-	#else
+	 #else
 		bjmodn = (int*)(sse_n   + RE_IM_STRIDE);
+	 #endif
 	#endif
 
 	#endif	// USE_SSE2
@@ -1639,25 +1704,31 @@ void radix48_dit_pass1(double a[], int n)
 	{
 		struct cy_thread_data_t* thread_arg = targ;	// Move to top because scalar-mode carry pointers taken directly from it
 		double *addr;
+	  #ifdef USE_SSE2
 		const int pfetch_dist = PFETCH_DIST;
+	  #endif
 		const int stride = (int)RE_IM_STRIDE << 1;	// main-array loop stride = 2*RE_IM_STRIDE
 		uint32 p01,p02,p03,p04,p05,p06,p07,p08,p16,p32;
 		int poff[RADIX>>2];
-		int incr,j,j1,j2,jt,jp,k,l,ntmp;
+		int j,j1,l;
+	  #ifndef USE_SSE2
+		int j2,jt,jp;
+	  #endif
+	  #if !defined(USE_SSE2) || defined(USE_AVX512)
+		int ntmp;
+	  #endif
+	  #ifdef USE_SSE2
+		int incr;
 		// incr = Carry-chain wts-multipliers recurrence length;
 		//incr must divide RADIX/[n-wayness of carry macro], e.g. RADIX/[16|8|4] = 3|6|12 for avx512,avx,sse, respectively:
-	  #ifdef USE_SSE2	// For RADIX = 48, the only SIMD mode for which recurrence is possible is SSE2, others are don't-care
+	    // For RADIX = 48, the only SIMD mode for which recurrence is possible is SSE2, others are don't-care
 		const int incr_long = 12,incr_med = 6,incr_short = 4;
-	  #else
-		const int incr_long = 0,incr_med = 0,incr_short = 0;
-		uint32 p0123[4];
-	  #endif
 	  // Have no specialized HIACC carry macro in USE_AVX512 and ARMv8 SIMD, use nonzero incr-value to differenetiate vs AVX/AVX2:
-	  #if defined(USE_AVX512) || defined(USE_ARM_V8_SIMD)
+	   #if defined(USE_AVX512) || defined(USE_ARM_V8_SIMD)
 		const int incr_hiacc = 2;
-	  #else
+	   #else
 		const int incr_hiacc = 0;
-	  #endif
+	   #endif
 		// Allows cy-macro error data to be used to fiddle incr on the fly to a smaller, safer value if necessary
 		if(USE_SHORT_CY_CHAIN == 0)
 			incr = incr_long;
@@ -1667,8 +1738,13 @@ void radix48_dit_pass1(double a[], int n)
 			incr = incr_short;
 		else
 			incr = incr_hiacc;
+	#else
+		uint32 p0123[4];
+	#endif
 
+	#ifndef USE_AVX
 		double wtl,wtlp1,wtn,wtnm1;	/* Mersenne-mod weights stuff */
+	#endif
 	#ifdef USE_AVX512
 		double t0,t1,t2,t3;
 	  #ifdef CARRY_16_WAY
@@ -1681,7 +1757,9 @@ void radix48_dit_pass1(double a[], int n)
 	#else
 		int n_minus_sil,n_minus_silp1,sinwt,sinwtm1;
 	#endif
+	#ifndef USE_SSE2
 		double wt_re,wt_im, wi_re,wi_im;	// Fermat-mod/LOACC weights stuff, used in both scalar and SIMD mode
+	#endif
 
 	#ifdef USE_SSE2
 	  #if COMPACT_OBJ
@@ -1689,25 +1767,45 @@ void radix48_dit_pass1(double a[], int n)
 		int k0,k1,k2,k3,k4,k5,k6,k7,k8,k9,ka,kb,kc,kd,ke,kf;
 	  #endif
 		int po_kperm[16],*po_ptr = &(po_kperm[0]);
+	  #ifndef USE_AVX512
 		const double crnd = 3.0*0x4000000*0x2000000;
-		int *itmp,*itm2;	// Pointer into the bjmodn array
+	  #endif
+		int *itmp;	// Pointer into the bjmodn array
+	  #if defined(USE_AVX) && !defined(USE_AVX512)
+		int *itm2;	// Pointer into the bjmodn array
+	  #endif
+	  #ifndef USE_AVX
 		struct complex *ctmp;	// Hybrid AVX-DFT/SSE2-carry scheme used for Mersenne-mod needs a 2-word-double pointer
-		double *add0,*add1,*add2,*add3,*add4,*add5,*add6,*add7,*add8,*add9,*adda,*addb,*addc,*addd,*adde,*addf;
+	  #endif
+		double *add0,*add1,*add2,*add3;
 		int *bjmodn;	// Alloc mem for this along with other 	SIMD stuff
-		vec_dbl *sqrt2,*isrt2,*two,*one, *cc0, *ss0, *cc1, *ss1, *max_err, *sse2_rnd, *half_arr,
-			*r00r,*r01r,*r02r,*r03r,*r04r,*r05r,*r06r,*r07r,
+		vec_dbl /* *sqrt2, */*isrt2,*two,/* *one, *cc0, *ss0, */ *cc1, /* *ss1, */ *max_err,
+		  #ifndef USE_AVX512
+			*sse2_rnd,
+		  #endif
+			*half_arr,
+			*r00r,
+		  #if !COMPACT_OBJ
+			*r01r,*r02r,*r03r,*r04r,*r05r,*r06r,*r07r,
 			*r08r,*r09r,*r10r,*r11r,*r12r,*r13r,*r14r,*r15r,
 			*r16r,*r17r,*r18r,*r19r,*r20r,*r21r,*r22r,*r23r,
 			*r24r,*r25r,*r26r,*r27r,*r28r,*r29r,*r30r,*r31r,
 			*r32r,*r33r,*r34r,*r35r,*r36r,*r37r,*r38r,*r39r,
 			*r40r,*r41r,*r42r,*r43r,*r44r,*r45r,*r46r,*r47r,
-			*s1p00r,*s1p01r,*s1p02r,*s1p03r,*s1p04r,*s1p05r,*s1p06r,*s1p07r,*s1p08r,*s1p09r,*s1p10r,*s1p11r,
+		  #endif
+			*s1p00r,
+		  #if !COMPACT_OBJ
+			*s1p01r,*s1p02r,*s1p03r,*s1p04r,*s1p05r,*s1p06r,*s1p07r,*s1p08r,*s1p09r,*s1p10r,*s1p11r,
 			*s1p12r,*s1p13r,*s1p14r,*s1p15r,*s1p16r,*s1p17r,*s1p18r,*s1p19r,*s1p20r,*s1p21r,*s1p22r,*s1p23r,
 			*s1p24r,*s1p25r,*s1p26r,*s1p27r,*s1p28r,*s1p29r,*s1p30r,*s1p31r,*s1p32r,*s1p33r,*s1p34r,*s1p35r,
 			*s1p36r,*s1p37r,*s1p38r,*s1p39r,*s1p40r,*s1p41r,*s1p42r,*s1p43r,*s1p44r,*s1p45r,*s1p46r,*s1p47r,
+		  #endif
 			*cy;	// Need RADIX/2 slots for sse2 carries, RADIX/4 for avx
-		vec_dbl *tmp,*tm1,*tm2;	// Non-static utility ptrs
+		vec_dbl *tmp,*tm1;	// Non-static utility ptrs
+	  #ifndef USE_AVX512
+		vec_dbl *tm2;	// Non-static utility ptrs
 		double dtmp;
+	  #endif
 		uint64 *sign_mask, *sse_bw, *sse_sw, *sse_n;
 
 	#else
@@ -1719,7 +1817,7 @@ void radix48_dit_pass1(double a[], int n)
 		int m,m2;
 		double wt,wtinv,wtA,wtB,wtC;	/* Mersenne-mod weights stuff */
 		int bjmodn[RADIX];	// Thread only carries a base datum here, must alloc a local array for remaining values
-		double *cy = thread_arg->cy, rt,it,temp,frac,
+		double *cy = thread_arg->cy, temp,frac,
 			t00,t01,t02,t03,t04,t05;
 		struct complex t[RADIX], *tptr;
 		int *itmp;	// Pointer into the bjmodn array
@@ -1727,7 +1825,6 @@ void radix48_dit_pass1(double a[], int n)
 	#endif
 
 	// int data:
-		int iter = thread_arg->iter;
 		int NDIVR = thread_arg->ndivr;
 		int n = NDIVR*RADIX;
 		int target_idx = thread_arg->target_idx;
@@ -1745,7 +1842,7 @@ void radix48_dit_pass1(double a[], int n)
 
 	// double data:
 		double maxerr = thread_arg->maxerr;
-		double scale = thread_arg->scale;	int full_pass = scale < 0.5;
+		double scale = thread_arg->scale;
 		double prp_mult = thread_arg->prp_mult;
 
 	// pointer data:
@@ -1845,18 +1942,18 @@ void radix48_dit_pass1(double a[], int n)
 	  #endif
 		tmp += 0x60;
 		two     = tmp + 0;	// AVX+ versions of radix-8,16,32 twiddleless-DFT macros need consts [2,1,sqrt2,isrt2] quartet laid out thusly
-		one     = tmp + 1;
-		sqrt2   = tmp + 2;
+		//one     = tmp + 1;
+		//sqrt2   = tmp + 2;
 		isrt2   = tmp + 3;
-		cc0		= tmp + 4;
-		ss0		= tmp + 5;
+		//cc0		= tmp + 4;
+		//ss0		= tmp + 5;
 		cc1		= tmp + 6;
-		ss1		= tmp + 7;
+		//ss1		= tmp + 7;
 		tmp += 0x08;	// sc_ptr += 0xc8
 	  #ifdef USE_AVX512
 		cy = tmp;		tmp += 0x06;
 		max_err = tmp + 0x00;
-		sse2_rnd= tmp + 0x01;
+		//sse2_rnd= tmp + 0x01;
 		half_arr= tmp + 0x02;
 	  #elif defined(USE_AVX)
 		cy = tmp;		tmp += 0x0c;	// sc_ptr += 0xd4
diff --git a/src/radix48_main_carry_loop.h b/src/radix48_main_carry_loop.h
index 4ae0e2bc..d7f4fe60 100755
--- a/src/radix48_main_carry_loop.h
+++ b/src/radix48_main_carry_loop.h
@@ -23,7 +23,7 @@
 // This main loop is same for un-and-multithreaded, so stick into a header file
 // (can't use a macro because of the #if-enclosed stuff).
 
-for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
+for(int k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 {
 	/* In SIMD mode, data are arranged in [re_0,...,re_n-1,im_0,...,im_n-1] groups, not the usual [re_0,im_0],...,[re_n-1,im_n-1] pairs.
 	Thus we can still increment the j-index as if stepping through the residue array-of-doubles in strides of 2,
@@ -34,7 +34,9 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 	{
 		j1 =  j;
 		j1 = j1 + ( (j1 >> DAT_BITS) << PAD_BITS );	/* padded-array fetch index is here */
+	  #ifndef USE_SSE2
 		j2 = j1 + RE_IM_STRIDE;
+	  #endif
 
 	/*...The radix-48 DIT pass is here:	*/
 	  #ifdef USE_ARM_V8_SIMD
@@ -331,7 +333,10 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 	  #endif
 		i = (!j);
 		addr = &prp_mult;
-		tmp = s1p00r; tm1 = cy; tm2 = cy+1; itmp = bjmodn; itm2 = bjmodn+4;
+		tmp = s1p00r; tm1 = cy; itmp = bjmodn;
+	  #ifndef USE_AVX512
+		tm2 = cy+1; itm2 = bjmodn+4;
+	  #endif
 		for(l = 0; l < nloop; l++) {
 			// Each AVX carry macro call also processes 8 prefetches of main-array data
 			add0 = a + j1 + pfetch_dist + poff[l+l];
@@ -770,4 +775,4 @@ for(k=1; k <= khi; k++)	/* Do n/(radix(1)*nwt) outer loop executions...	*/
 
 	col += RADIX;
 	co3 -= RADIX;
-}	/* end for(k=1; k <= khi; k++) */
+}	/* end for(int k=1; k <= khi; k++) */