diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..e97e5fba
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,244 @@
+name: CI
+on:
+  push:
+  pull_request:
+  schedule:
+    - cron:  '0 0 1 * *'
+
+jobs:
+  Linux:
+    name: Mlucas Linux
+
+    runs-on: ${{ matrix.os }}
+    continue-on-error: ${{ matrix.cc == 'clang' }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, ubuntu-22.04]
+        cc: [gcc, clang]
+      fail-fast: false
+    env:
+      CC: ${{ matrix.cc }}
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        sudo apt-get -yqq update
+        sudo apt-get -yqq install libhwloc-dev
+    - name: Before script
+      run: |
+        sed -i 's/-j/-O -j/' makemake.sh
+        $CC --version
+    - name: Script
+      run: |
+        set -x
+        set -o pipefail
+        lscpu
+        cat /proc/cpuinfo
+        bash -e -o pipefail -- makemake.sh use_hwloc
+        (cd obj; make clean)
+        for arg in '' '1word' '2word' '3word' '4word' 'nword'; do echo -e "\nMfactor $arg\n"; bash -e -o pipefail -- makemake.sh mfac $arg use_hwloc; (cd obj_mfac; make clean); done
+        cd obj
+        echo -e '## Warnings\n```' >> $GITHUB_STEP_SUMMARY
+        grep 'warning:' build.log | sed 's/\x1B\[\([0-9]\+\(;[0-9]\+\)*\)\?m//g' | awk '{ print $NF }' | sort | uniq -c | sort -nr >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+        for s in tt t s m l; do time ./Mlucas -s $s -core "0:$(( $(nproc --all) - 1 ))" |& tee -a test.log; done
+    - uses: actions/upload-artifact@v3
+      if: always()
+      with:
+        name: ${{ matrix.os }}_${{ matrix.cc }}_mlucas
+        path: ${{ github.workspace }}
+
+  ASan-Linux:
+    name: AddressSanitizer Linux
+
+    runs-on: ubuntu-latest
+    continue-on-error: ${{ matrix.cc == 'clang' }}
+    strategy:
+      matrix:
+        cc: [gcc, clang]
+      fail-fast: false
+    env:
+      CC: ${{ matrix.cc }}
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        sudo apt-get -yqq update
+        sudo apt-get -yqq install libhwloc-dev
+    - name: Before script
+      run: |
+        sed -i 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+        sed -i 's/-j/-O -j/' makemake.sh
+        $CC --version
+    - name: Script
+      run: |
+        set -x
+        bash -e -o pipefail -- makemake.sh use_hwloc
+        cd obj
+        make clean
+        time ./Mlucas -s m -core "0:$(( $(nproc --all) - 1 ))"
+      continue-on-error: true
+
+  TSan-Linux:
+    name: ThreadSanitizer Linux
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        cc: [gcc, clang]
+      fail-fast: false
+    env:
+      CC: ${{ matrix.cc }}
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        sudo apt-get -yqq update
+        sudo apt-get -yqq install libhwloc-dev
+    - name: Before script
+      run: |
+        sed -i 's/-O3/-Og -fsanitize=thread/' makemake.sh
+        sed -i 's/-j/-O -j/' makemake.sh
+        $CC --version
+    - name: Script
+      run: |
+        set -x
+        bash -e -o pipefail -- makemake.sh use_hwloc
+        cd obj
+        make clean
+        time ./Mlucas -s s -core "0:$(( $(nproc --all) - 1 ))"
+      continue-on-error: true
+
+  GCC-analyzer:
+    name: GCC analyzer
+
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: GCC analyzer
+      run: |
+        set -x
+        set -o pipefail
+        gcc -c -fdiagnostics-color -g -O3 -march=native -DUSE_THREADS -fanalyzer src/*.c |& tee analyzer.log
+        rm -- *.o
+        echo -e '## GCC analyzer\n```' >> $GITHUB_STEP_SUMMARY
+        grep 'warning:' analyzer.log | sed 's/\x1B\[\([0-9]\+\(;[0-9]\+\)*\)\?m//g' | awk '{ print $NF }' | sort | uniq -c | sort -nr >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+
+  Clang-Tidy:
+    name: Clang-Tidy
+
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Clang-Tidy
+      run: |
+        set -x
+        set -o pipefail
+        clang-tidy --use-color -checks='bugprone-*,cert-*,clang-analyzer-*,concurrency-*,misc-const-correctness,misc-redundant-expression,misc-unused-*,modernize-*,performance-*,portability-*,readability-const-return-type,readability-container-*,readability-duplicate-include,readability-else-after-return,readability-make-member-function-cons,readability-non-const-parameter,readability-redundant-*,readability-simplify-*,readability-string-compare,readability-use-anyofallof' -header-filter='.*' src/*.c -- -Wall -O3 -march=native -DUSE_THREADS |& tee clang-tidy.log
+        echo -e '## Clang-Tidy\n```' >> $GITHUB_STEP_SUMMARY
+        grep 'warning:' clang-tidy.log | sed 's/\x1B\[\([0-9]\+\(;[0-9]\+\)*\)\?m//g' | awk '{ print $NF }' | sort | uniq -c | sort -nr >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+
+  Cppcheck:
+    name: Cppcheck
+
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        sudo apt-get -yqq update
+        sudo apt-get -yqq install cppcheck
+    - name: Cppcheck
+      run: |
+        mkdir build
+        cppcheck --enable=all -DUSE_THREADS --force --cppcheck-build-dir=build -j "$(nproc)" --clang .
+        cppcheck --enable=all -DUSE_THREADS --force --cppcheck-build-dir=build --clang . &> cppcheck.log
+        echo -e '## Cppcheck\n```' >> $GITHUB_STEP_SUMMARY
+        grep '\(error\|warning\|style\|performance\|portability\|information\):' cppcheck.log | awk '{ print $2, $NF }' | sort | uniq -c | sort -nr >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+
+  ShellCheck:
+    name: ShellCheck
+
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: ShellCheck
+      run: shopt -s globstar; shellcheck -o avoid-nullary-conditions,check-extra-masked-returns,check-set-e-suppressed,deprecate-which,quote-safe-variables,require-double-brackets -s bash **/*.sh
+      continue-on-error: true
+
+  macOS:
+    name: Mlucas macOS
+
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [macos-11, macos-12, macos-13]
+      fail-fast: false
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        brew install hwloc
+    - name: Script
+      run: |
+        set -x
+        set -o pipefail
+        sysctl -a
+        bash -e -o pipefail -- makemake.sh use_hwloc
+        (cd obj; make clean)
+        for arg in '' '1word' '2word' '3word' '4word' 'nword'; do echo -e "\nMfactor $arg\n"; bash -e -o pipefail -- makemake.sh mfac $arg use_hwloc; (cd obj_mfac; make clean); done
+        cd obj
+        echo -e '## Warnings\n```' >> $GITHUB_STEP_SUMMARY
+        grep 'warning:' build.log | sed 's/\x1B\[\([0-9]\+\(;[0-9]\+\)*\)\?m//g' | awk '{ print $NF }' | sort | uniq -c | sort -nr >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+        for s in tt t s m l; do time ./Mlucas -s $s -core "0:$(( $(sysctl -n hw.ncpu) - 1 ))" 2>&1 | tee -a test.log; done
+    - uses: actions/upload-artifact@v3
+      if: always()
+      with:
+        name: ${{ matrix.os }}_mlucas
+        path: ${{ github.workspace }}
+
+  ASan-macOS:
+    name: AddressSanitizer macOS
+
+    runs-on: macos-13
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        brew install hwloc
+    - name: Before script
+      run: |
+        sed -i '' 's/-O3/-Og -fsanitize=address,undefined/' makemake.sh
+    - name: Script
+      run: |
+        set -x
+        bash -e -o pipefail -- makemake.sh use_hwloc
+        cd obj
+        make clean
+        time ./Mlucas -s m -core "0:$(( $(sysctl -n hw.ncpu) - 1 ))"
+      continue-on-error: true
+
+  TSan-macOS:
+    name: ThreadSanitizer macOS
+
+    runs-on: macos-13
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install
+      run: |
+        brew install hwloc
+    - name: Before script
+      run: |
+        sed -i '' 's/-O3/-Og -fsanitize=thread/' makemake.sh
+    - name: Script
+      run: |
+        set -x
+        bash -e -o pipefail -- makemake.sh use_hwloc
+        cd obj
+        make clean
+        time ./Mlucas -s s -core "0:$(( $(sysctl -n hw.ncpu) - 1 ))"
+      continue-on-error: true
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..bd218e69
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+obj*/
diff --git a/makemake.sh b/makemake.sh
old mode 100755
new mode 100644
index 067664c4..17eba3a7
--- a/makemake.sh
+++ b/makemake.sh
@@ -6,7 +6,7 @@
 
 ################################################################################
 #                                                                              #
-#   (C) 2021 by Ernst W. Mayer.                                                #
+#   (C) 2021 by Ernst W. Mayer and Teal Dulcet.                                #
 #                                                                              #
 #  This program is free software; you can redistribute it and/or modify it     #
 #  under the terms of the GNU General Public License as published by the       #
@@ -26,9 +26,11 @@
 #                                                                              #
 ################################################################################
 
-# Exit if any of the sommands fail:
+# Exit if any of the commands fail:
 set -e
 
+shopt -s nocasematch
+
 # for mode in avx512 avx2 avx sse2; do
 	# if grep -iq "$mode" /proc/cpuinfo; then
 		# echo -e "The CPU supports the ${mode^^} SIMD build mode.\n"
@@ -37,21 +39,18 @@ set -e
 	# fi
 # done
 
-# $0 contains script-name, but $@ starts with first ensuing cmd-line arg, if it exists:
-echo "Total number of arguments : $#"
-for ((i=1; i<=$#; ++i)); do
-	echo "Arg[$i] = ${!i}"
-done
-if [[ $# -gt 1 ]]; then
-	echo "Usage: $0 [SIMD build mode]" >&2
-	echo "Only 1 optional argument supported, it must be one of the supported SIMD-arithmetic types:" >&2
-	echo -e "\t[x86_64: avx512_skylake avx512_knl avx2 avx sse2]; [Armv8: asimd]; or 'nosimd' for scalar-double build.\n" >&2
-	exit 1
-fi
-
 DIR=obj
-EXE=Mlucas
-ARGS=()
+Mlucas=Mlucas
+Mfactor=Mfactor
+TARGET=$Mlucas
+ARGS=(-DUSE_THREADS) # Optional compile args
+WORDS=''
+# Optional link args
+LARG=()
+
+MODES=()
+GMP=1
+HWLOC=0
 
 if echo "$OSTYPE" | grep -iq 'darwin'; then
 	echo -e "MacOS detected for build host.\n"
@@ -61,111 +60,257 @@ else # echo "$OSTYPE" | grep -iq 'linux'
 	CPU_THREADS=$(nproc --all)
 fi
 
-# Thx to tdulcet for streamlined case-based syntax here, but ugh - non-matching ')', really?:
-if [[ $# -eq 1 ]]; then
-
-	if [ "$1" = 'avx512_skylake' ]; then
-		echo "Building for avx512_skylake SIMD in directory obj_$1; the executable will be named Mlucas_$1"
-		ARGS+=( "-DUSE_AVX512" -march=skylake-avx512 )
-	elif [ "$1" = 'avx512_knl' ]; then
-		echo "Building for avx2 SIMD in directory obj_$1; the executable will be named Mlucas_$1"
-		ARGS+=( "-DUSE_AVX512" -march=knl )
-	elif [ "$1" = 'avx2' ]; then
-		echo "Building for avx2 SIMD in directory obj_$1; the executable will be named Mlucas_$1"
-		ARGS+=( "-DUSE_AVX2" -mavx2 )
-	elif [ "$1" = 'avx' ]; then
-		echo "Building for avx SIMD in directory obj_$1; the executable will be named Mlucas_$1"
-		ARGS+=( "-DUSE_AVX" -mavx )
-	elif [ "$1" = 'sse2' ]; then
-		echo "Building for sse2 SIMD in directory obj_$1; the executable will be named Mlucas_$1"
-		ARGS+=( "-DUSE_SSE2" )
-	elif [ "$1" = 'asimd' ]; then
-		echo "Building for avx2 SIMD in directory obj_$1; the executable will be named Mlucas_$1"
-		ARGS+=( "-DUSE_ARM_V8_SIMD" )
-	elif [ "$1" = 'nosimd' ]; then
-		echo "Building in scalar-double (no-SIMD) mode in directory obj_$1; the executable will be named Mlucas_$1"
-		# This one's a no-op
+if ! command -v make >/dev/null; then
+	echo "Error: This script requires Make" >&2
+	echo "On Ubuntu and Debian run: 'sudo apt-get update' and 'sudo apt-get install build-essential -y'" >&2
+	exit 1
+fi
+if [[ -n $CC ]]; then
+	if ! command -v "$CC" >/dev/null; then
+		echo "Error: $CC is not installed." >&2
+		exit 1
+	fi
+elif ! command -v gcc >/dev/null; then
+	echo "Error: This script requires the GNU C compiler" >&2
+	echo "On Ubuntu and Debian run: 'sudo apt-get update' and 'sudo apt-get install build-essential -y'" >&2
+	exit 1
+fi
+
+# $0 contains script-name, but $@ starts with first ensuing cmd-line arg, if it exists:
+echo "Total number of input parameters = $#"
+
+# v21: Keep the cross-platform-build arch-specifying command-line flag, but now also need to
+# support several added ones for 3rd-party-library usage. This needs to be in arbitrary argument
+# order fashion, [details snipped]
+arglist=("$@") # Local array into which we copy cmd-line args in order to be able to manipulate them
+for i in "${!arglist[@]}"; do
+	echo "Arg[$i] = ${arglist[i]}"
+done
+# Now loop over the optional args and execute the above-described preprocessing step:
+for arg in "$@"; do
+
+	case ${arg} in
+		'no_gmp')
+			GMP=0
+			;;
+		'use_hwloc')
+			HWLOC=1
+			;;
+		'avx512_skylake' | 'avx512_knl' | 'k1om' | 'avx2' | 'avx' | 'sse2' | 'asimd' | 'nosimd')
+			MODES+=("$arg")
+			;;
+		'mfac')
+			TARGET=$Mfactor
+			;;
+		'1word' | '2word' | '3word' | '4word' | 'nword')
+			WORDS=$arg
+			;;
+		*)
+			echo "Usage: $0 [SIMD build mode]" >&2
+			echo "Optional arguments must be 'no_gmp', 'use_hwloc' or one and only one of the supported SIMD-arithmetic types:" >&2
+			echo -e "\t[x86_64: avx512_skylake avx512_knl k1om avx2 avx sse2]; [Armv8: asimd]; or 'nosimd' for scalar-double build.\n" >&2
+			exit 1
+			;;
+	esac
+
+done
+
+if ((GMP)); then
+	LARG+=(-lgmp)
+else
+	echo "Building sans Gnu-MP ... this means no GCDs will be taken in p-1 work."
+	ARGS+=(-DINCLUDE_GMP=0)
+fi
+
+if ((HWLOC)); then
+	echo "Building with HWLOC hardware-topology support."
+	ARGS+=(-DINCLUDE_HWLOC=1)
+	LARG+=(-lhwloc)
+fi
+
+if [[ $TARGET == "$Mfactor" ]]; then
+	DIR+=_mfac
+	# trap "rm $PWD/src/factor.c" EXIT
+	# cp -vf src/factor.c{.txt,}
+fi
+
+if [[ -n $WORDS ]]; then
+	if [[ $TARGET == "$Mfactor" ]]; then
+		arg=$WORDS
+		if [[ ${arg} == 'nword' ]]; then
+			WORDS=-DNWORD
+		else
+			WORDS=-DP"${arg::1}"WORD
+		fi
+		Mfactor+="_$arg"
+		TARGET=$Mfactor
 	else
-		echo "Unrecognized SIMD-build flag ... aborting."
+		echo "Error: The argument '$WORDS' requires 'mfac'." >&2
 		exit 1
 	fi
+fi
+
+# First if/elif clause handles cross-platform builds and non-default values for "Use GMP?" and "Use HWLOC?":
+# o "Use GMP" = TRUE is default in link step, 'no_gmp' overrides;
+# o "Use HWLOC" = FALSE is default, 'use_hwloc' overrides.
+# Thx to tdulcet for offering a streamlined case-based syntax here, but ugh - non-matching ')', really?:
+if [[ ${#MODES[*]} -gt 1 ]]; then
+	echo -e "Only one arch-specifying optional argument is allowed ... aborting." >&2
+	exit 1
+fi
+
+if [[ ${#MODES[*]} -eq 1 ]]; then
 
-	DIR+="_$1"
-	EXE+="_$1"
+	arg=${MODES[0]}
 
-elif uname -a | grep -iq 'Mac'; then
+	case ${arg} in
+		'avx512_skylake')
+			echo "Building for avx512_skylake SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_AVX512 -march=skylake-avx512)
+			;;
+		'avx512_knl')
+			echo "Building for avx512_knl SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_AVX512 -march=knl)
+			;;
+		'k1om')
+			echo "Building for 1st-gen Xeon Phi 512-bit SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_IMCI512)
+			;;
+		'avx2')
+			echo "Building for avx2 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_AVX2 -mavx2)
+			;;
+		'avx')
+			echo "Building for avx SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_AVX -mavx)
+			;;
+		'sse2')
+			echo "Building for sse2 SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_SSE2)
+			;;
+		'asimd')
+			echo "Building for asimd SIMD in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			ARGS+=(-DUSE_ARM_V8_SIMD)
+			;;
+		'nosimd')
+			echo "Building in scalar-double (no-SIMD) mode in directory '${DIR}_${arg}'; the executable will be named '${TARGET}'"
+			# This one's a no-op
+			;;
+		*)
+			echo "Unrecognized SIMD-build flag ... aborting." >&2
+			exit 1
+			;;
+	esac
 
-	echo -e "MacOS detected.\n"
+	DIR+="_$arg"
+
+elif echo "$OSTYPE" | grep -iq 'darwin'; then
+
+	# MacOS:
 	if sysctl -a | grep machdep.cpu.features | grep -iq 'avx512'; then
 		echo -e "The CPU supports the AVX512 SIMD build mode.\n"
-		ARGS+=( "-DUSE_AVX512" -march=native )
+		ARGS+=(-DUSE_AVX512 -march=native)
 	elif sysctl -a | grep machdep.cpu.features | grep -iq 'avx2'; then
 		echo -e "The CPU supports the AVX2 SIMD build mode.\n"
-		ARGS+=( "-DUSE_AVX2" -march=native -mavx2 )
+		ARGS+=(-DUSE_AVX2 -march=native -mavx2)
 	elif sysctl -a | grep machdep.cpu.features | grep -iq 'avx'; then
 		echo -e "The CPU supports the AVX SIMD build mode.\n"
-		ARGS+=( "-DUSE_AVX" -march=native -mavx )
+		ARGS+=(-DUSE_AVX -march=native -mavx)
 	elif sysctl -a | grep machdep.cpu.features | grep -iq 'sse2'; then
 		echo -e "The CPU supports the SSE2 SIMD build mode.\n"
-		ARGS+=( "-DUSE_SSE2" -march=native )
+		# On my Core2Duo Mac, 'native' gives "error: bad value for -march= switch":
+		ARGS+=(-DUSE_SSE2 -march=core2)
 	elif sysctl -a | grep machdep.cpu.features | grep -iq 'asimd'; then
 		echo -e "The CPU supports the ASIMD build mode.\n"
-		ARGS+=( "-DUSE_ARM_V8_SIMD" -march=native )
+		ARGS+=(-DUSE_ARM_V8_SIMD -march=native)
 	else
 		echo -e "The CPU supports no Mlucas-recognized ASIMD build mode ... building in scalar-double mode.\n"
-		ARGS+=( -march=native )
+		ARGS+=(-march=native)
 	fi
 
 else
 
-	echo -e "Assuming OS = Linux.\n"
+	# Linux:
 	if grep -iq 'avx512' /proc/cpuinfo; then
 		echo -e "The CPU supports the AVX512 SIMD build mode.\n"
-		ARGS+=( "-DUSE_AVX512" -march=native )
+		ARGS+=(-DUSE_AVX512 -march=native)
 	elif grep -iq 'avx2' /proc/cpuinfo; then
 		echo -e "The CPU supports the AVX2 SIMD build mode.\n"
-		ARGS+=( "-DUSE_AVX2" -march=native -mavx2 )
+		ARGS+=(-DUSE_AVX2 -march=native -mavx2)
 	elif grep -iq 'avx' /proc/cpuinfo; then
 		echo -e "The CPU supports the AVX SIMD build mode.\n"
-		ARGS+=( "-DUSE_AVX" -march=native -mavx )
+		ARGS+=(-DUSE_AVX -march=native -mavx)
 	elif grep -iq 'sse2' /proc/cpuinfo; then
 		echo -e "The CPU supports the SSE2 SIMD build mode.\n"
-		ARGS+=( "-DUSE_SSE2" -march=native )
+		ARGS+=(-DUSE_SSE2 -march=native)
 	elif grep -iq 'asimd' /proc/cpuinfo; then
 		echo -e "The CPU supports the ASIMD build mode.\n"
-		ARGS+=( "-DUSE_ARM_V8_SIMD" -march=native )
+		ARGS+=(-DUSE_ARM_V8_SIMD -march=native)
 	else
 		echo -e "The CPU supports no Mlucas-recognized ASIMD build mode ... building in scalar-double mode.\n"
-		ARGS+=( -march=native )
+		ARGS+=(-march=native)
 	fi
 
 fi
 
+if [[ -d $DIR ]]; then
+	echo "Warning: '$DIR' already exists"
+fi
+
 # -p prevents "File exists" warning if obj-dir already exists:
 mkdir -p "$DIR"
 cd "$DIR"
 
+if [[ -x $TARGET ]]; then
+	echo "Error: '$DIR/$TARGET' already exists." >&2
+	exit 1
+fi
+
 # Clang-under-MacOS linker barfs if one tries to explicitly invoke standard libs - h/t tdulcet for the
 # conditional-inline syntax. Some OSes put the GMP headers in /usr/local/include, so -I that path in the
 # compile command. If said path does not exist, make silently ignores it.
 # Re. the -g flag to include the debugging symbols, they bloat executable size but if someone's Mlucas
 # crashes/segfaults, one can rerun with GDB (gdb -ex=r ./Mlucas) to see the filename, line number and
 # stack trace of the issue. If one wishes, one can run 'strip -g Mlucas' to remove the debugging symbols:
-cat << EOF > Makefile
+cat <<EOF >Makefile
 CC?=gcc
-OBJS=\$(patsubst ../src/%.c, %.o, \$(wildcard ../src/*.c))
+CFLAGS=-fdiagnostics-color -Wall -g -O3 # -flto=auto
+CPPFLAGS=-I/usr/local/include
+LDLIBS=$(echo "$OSTYPE" | grep -iq 'darwin' || echo "-lm -lpthread -lrt") ${LARG[@]}
+
+OBJS=br.o dft_macro.o fermat_mod_square.o fgt_m61.o get_cpuid.o get_fft_radices.o get_fp_rnd_const.o get_preferred_fft_radix.o getRealTime.o imul_macro.o mers_mod_square.o mi64.o Mlucas.o pairFFT_mul.o pair_square.o pm1.o qfloat.o radix1008_ditN_cy_dif1.o radix1024_ditN_cy_dif1.o radix104_ditN_cy_dif1.o radix10_ditN_cy_dif1.o radix112_ditN_cy_dif1.o radix11_ditN_cy_dif1.o radix120_ditN_cy_dif1.o radix128_ditN_cy_dif1.o radix12_ditN_cy_dif1.o radix13_ditN_cy_dif1.o radix144_ditN_cy_dif1.o radix14_ditN_cy_dif1.o radix15_ditN_cy_dif1.o radix160_ditN_cy_dif1.o radix16_dif_dit_pass.o radix16_ditN_cy_dif1.o radix16_dyadic_square.o radix16_pairFFT_mul.o radix16_wrapper_ini.o radix16_wrapper_square.o radix176_ditN_cy_dif1.o radix17_ditN_cy_dif1.o radix18_ditN_cy_dif1.o radix192_ditN_cy_dif1.o radix208_ditN_cy_dif1.o radix20_ditN_cy_dif1.o radix224_ditN_cy_dif1.o radix22_ditN_cy_dif1.o radix240_ditN_cy_dif1.o radix24_ditN_cy_dif1.o radix256_ditN_cy_dif1.o radix26_ditN_cy_dif1.o radix288_ditN_cy_dif1.o radix28_ditN_cy_dif1.o radix30_ditN_cy_dif1.o radix31_ditN_cy_dif1.o radix320_ditN_cy_dif1.o radix32_dif_dit_pass.o radix32_ditN_cy_dif1.o radix32_dyadic_square.o radix32_wrapper_ini.o radix32_wrapper_square.o radix352_ditN_cy_dif1.o radix36_ditN_cy_dif1.o radix384_ditN_cy_dif1.o radix4032_ditN_cy_dif1.o radix40_ditN_cy_dif1.o radix44_ditN_cy_dif1.o radix48_ditN_cy_dif1.o radix512_ditN_cy_dif1.o radix52_ditN_cy_dif1.o radix56_ditN_cy_dif1.o radix5_ditN_cy_dif1.o radix60_ditN_cy_dif1.o radix63_ditN_cy_dif1.o radix64_ditN_cy_dif1.o radix6_ditN_cy_dif1.o radix72_ditN_cy_dif1.o radix768_ditN_cy_dif1.o radix7_ditN_cy_dif1.o radix80_ditN_cy_dif1.o radix88_ditN_cy_dif1.o radix8_dif_dit_pass.o radix8_ditN_cy_dif1.o radix960_ditN_cy_dif1.o radix96_ditN_cy_dif1.o radix992_ditN_cy_dif1.o radix9_ditN_cy_dif1.o rng_isaac.o threadpool.o twopmodq100.o twopmodq128_96.o twopmodq128.o twopmodq160.o twopmodq192.o twopmodq256.o twopmodq64_test.o twopmodq80.o twopmodq96.o twopmodq.o types.o util.o
+OBJS_MFAC=getRealTime.o get_cpuid.o get_fft_radices.o get_fp_rnd_const.o imul_macro.o mi64.o qfloat.o rng_isaac.o twopmodq100.o twopmodq128_96.o twopmodq128.o twopmodq160.o twopmodq192.o twopmodq256.o twopmodq64_test.o twopmodq80.o twopmodq96.o twopmodq.o types.o util.o threadpool.o factor.o
 
-$EXE: \$(OBJS)
-	\$(CC) -Wall -g -o \$@ \$(OBJS) $(echo "$OSTYPE" | grep -iq 'darwin' || echo "-lm -lpthread -lrt") -lgmp
+$Mlucas: \$(OBJS)
+	\$(CC) \$(LDFLAGS) \$(CFLAGS) -o \$@ \$^ \$(LDLIBS)
+$Mfactor: \$(OBJS_MFAC)
+	\$(CC) \$(LDFLAGS) \$(CFLAGS) -o \$@ \$^ \$(LDLIBS)
+factor.o: ../src/factor.c
+	\$(CC) \$(CFLAGS) \$(CPPFLAGS) -c ${ARGS[@]} -DFACTOR_STANDALONE $WORDS -DTRYQ=4 \$<
 %.o: ../src/%.c
-	\$(CC) -Wall -g -c -I/usr/local/include -O3 ${ARGS[@]} -DUSE_THREADS \$<
+	\$(CC) \$(CFLAGS) \$(CPPFLAGS) -c ${ARGS[@]} \$<
 clean:
-	rm -f *.o
+	rm -f \$(OBJS) \$(OBJS_MFAC)
+
+.phony: clean
 EOF
 
-echo -e "\nBuilding Mlucas"
-printf "%s CPU cores detected ... parallel-building using that number of make threads.\n" "$CPU_THREADS"
-if ! make -j "$CPU_THREADS" > build.log 2>&1; then
-	echo -e "There were build errors - see build.log for details.\n"
-	exit 1
+# if [[ -e build.log ]]; then
+	# cp -vf --backup=t build.log{,}
+# fi
+
+echo -e "Building $TARGET"
+printf "%'d CPU cores detected ... parallel-building using that number of make threads.\n" "$CPU_THREADS"
+if ! time make -j "$CPU_THREADS" "$TARGET" >build.log 2>&1; then
+	echo -e "\n*** There were build errors - see '${DIR}/build.log' for details. ***\n" >&2
+	grep -A 2 'error:' build.log || tail build.log
+	# exit 1
 fi
+
+echo -e "\nWarnings:\n"
+grep 'warning:' build.log | awk '{ print $NF }' | sort | uniq -c | sort -nr
+
+echo -e "\nErrors:\n"
+grep -A 2 'error:' build.log || echo "None"
+
+echo
diff --git a/src/gcd_lehmer.c b/src/gcd_lehmer.c
deleted file mode 100755
index 79009a02..00000000
--- a/src/gcd_lehmer.c
+++ /dev/null
@@ -1,4056 +0,0 @@
-/*******************************************************************************
-*                                                                              *
-*   (C) 1997-2016 by Ernst W. Mayer.                                           *
-*                                                                              *
-*  This program is free software; you can redistribute it and/or modify it     *
-*  under the terms of the GNU General Public License as published by the       *
-*  Free Software Foundation; either version 2 of the License, or (at your      *
-*  option) any later version.                                                  *
-*                                                                              *
-*  This program is distributed in the hope that it will be useful, but WITHOUT *
-*  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
-*  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
-*  more details.                                                               *
-*                                                                              *
-*  You should have received a copy of the GNU General Public License along     *
-*  with this program; see the file GPL.txt.  If not, you may view one at       *
-*  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
-*  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
-*  02111-1307, USA.                                                            *
-*                                                                              *
-*******************************************************************************/
-
-/*
-If the FFT-mul enabled, need all the stuff for factor.c + int-gcd,
-including factor.c built with -DINCLUDE_PM1, plus the following FFT stuff:
-
-g42 -c -Wall -g3 -ggdb -DFACTOR_STANDALONE -DTRYQ=4 -DNWORD -DINCLUDE_PM1 factor.c
-g42 -c -Wall -O3 br.c dft*.c get_fft*c *pairFFT*.c
-g42 -c -Wall -O3 radix32_dif_dit_pass.c
-g42 -c -Wall -O3 radix16_dif_dit_pass.c
-g42 -c -Wall -O3 radix8_dif_dit_pass.c
-g42 -c -Wall -O3 radix32_*cy*.c
-g42 -c -Wall -O3 radix16_*cy*.c
-g42 -c -Wall -O3 radix8_*cy*.c
-*/
-
-#include "gcd_lehmer.h"
-
-// Use x86_64 inline-asm?
-#undef YES_ASM
-#if(defined(CPU_IS_X86_64) && defined(COMPILER_TYPE_GCC) && (OS_BITS == 64))
-	#define YES_ASM
-#endif
-
-#define GCD_DEBUG	1
-/*
-   Set == 1 to enable basic per-pass diagnostic printing.
-   Set >= 2 to enable detailed per-pass diagnostic printing.
-			WARNING: level-2 diagnostics not recommended for large vectors!
-*/
-	int fft_gcd_debug = 0;
-	FILE *fp;
-	static char *file_access_mode[2] = {"a","w"};
-	char string0[STR_MAX_LEN];
-#if GCD_DEBUG >= 1
-	// Set GCD_DEBUG >= 1 and this != 0 to selectively enable debug
-	// printing for some particular self-test or phase of the computation:
-	int gcd_debug = 0;
-	char string1[STR_MAX_LEN];
-	char str_10k[10*1024];
-#endif
-#if GCD_DEBUG >= 2
-	char string2[STR_MAX_LEN];
-#endif
-
-#define FFTMUL_THRESHOLD_BITS	16384	/* # of bits in a multiword base-2^64 integer
-										at which to switch from pure-int grammar-school
-										to floating-FFT-based multiply algorithm */
-/***********************/
-
-// Returns: number of 64-bit words in computed abcd-multipliers.
-uint32 fft_gcd_get_mults(
-	const uint64 u[], const uint64 v[], const uint32 vec_len, const uint32 targ_len, const uint32 sec_len,
-	uint64 ai[], uint64 bi[], uint64 ci[], uint64 di[], uint64 w[], uint64 x[], uint64 y[], uint64 z[],
-	double a[], double b[], double c[], double d[], uint32*len_ab, uint32*len_cd, uint32*sign_ab, uint32*sign_cd)
-{
-	uint32 retval = 0;
-	uint32 i,j,k,kk,cy,cy_fwd, eGCD = FALSE, HALF = FALSE;
-	uint32 fft_len = sec_len<<4, pad_len = fft_len + ( (fft_len >> DAT_BITS) << PAD_BITS );	// Padded-array length
-	 int64 cy_re,cy_im;	// Prefer these to be signed
-	uint64 tmp64 = 0,cy1,cy2;
-	const double sign_mult[2] = {1.0,-1.0};
-	const int check_len = 4;	// #high 64-bit words to compute for hi-block cancellation check
-	uint64 tvec0[check_len],tvec1[check_len],tvec2[check_len],tvec3[check_len];
-
-		j = 2*sec_len+10;	// Adding 10 guard words to half-exit EGCD inputs to ensure accuracy of au-bv,cu-dv signs
-		// Copy leading ? bits of input-vecs into a pair of uint64-arrays. 16 bits per double ==> 64-bit vec,
-		// but need 2x as many bits in leadU,V as we will use in ensuing FFT-muls, hence the 2*sec_len:
-		mi64_set_eq(w,u+vec_len-j,j);	// w = high j words of u
-		mi64_set_eq(z,v+vec_len-j,j);	// z = high j words of v
-		// Return value is length of abcd-multipliers:
-		retval = mi64_gcd(w,z, j, eGCD=TRUE,ai,bi,len_ab,sign_ab, HALF=TRUE,ci,di,len_cd,sign_cd, sec_len);
-		ASSERT(HERE, retval == sec_len, "Unexpected retval from mi64_gcd call to get abcd-multipliers!");
-	#if 1
-		if(!*sign_ab) { printf("a*u-b*v > 0\n"); } else if(*sign_ab == 1) { printf("a*u-b*v < 0\n"); } else { ASSERT(HERE, 0, "foo!"); }
-		if(!*sign_cd) { printf("c*u-d*v > 0\n"); } else if(*sign_cd == 1) { printf("c*u-d*v < 0\n"); } else { ASSERT(HERE, 0, "bar!"); }
-	#endif
-		// Verify that resulting abcd-multipliers have at most (sec_len) set words:
-		i = sec_len;
-		ASSERT(HERE, ai[i] == 0ull && bi[i] == 0ull && ci[i] == 0ull && di[i] == 0ull, "abcd-multipliers exceed expected length!");
-		// Convert a+I*b and c+I*d to packed-double form, store outputs in double-arrays A,B, resp.:
-		cy = mi64_cvt_uint64_double(ai,bi,0,sec_len, a);
-		ASSERT(HERE, cy == 0, "FFT_gcd: unexpected carryout of mi64_cvt_uint64_double!");
-		cy = mi64_cvt_uint64_double(ci,di,0,sec_len, b);
-		ASSERT(HERE, cy == 0, "FFT_gcd: unexpected carryout of mi64_cvt_uint64_double!");
-		// Zero upper halves of FFT-multiplier vectors, that is the upper pad_len/2 doubles:
-		memset(a+(pad_len>>1), 0, (pad_len<<2));
-		memset(b+(pad_len>>1), 0, (pad_len<<2));
-
-		// Forward FFT of A/B-vector [in a] and C/D vector [in b]:
-		pairFFT_mul(  a,b,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-
-		/************ debug-dump egcd-mults: *****************/
-		if(0) {
-			fp = fopen("c.txt",file_access_mode[vec_len > 16000]);	ASSERT(HERE, fp != 0x0, "Null file pointer!");
-			fprintf(fp,"vec_len = %u\n",vec_len);
-			fprintf(fp,"Leading u,v-elts:\n");
-			for(i = vec_len; i > vec_len-10; i--) {
-				fprintf(fp,"%5u\t%20llu\t%20llu\n",i,u[i],v[i]);
-			}
-			fprintf(fp,"egcd-mults:\n");
-			for(i = 0; i < sec_len; i++) {
-				fprintf(fp,"%5u\t%20llu\t%20llu\t%20llu\t%20llu\n",i,ai[i],bi[i],ci[i],di[i]);
-			}
-			fprintf(fp,"fFFt[a,b]:\n");
-			for(i = 0; i < fft_len; i+=2) {
-				j = i + ( (i >> DAT_BITS) << PAD_BITS );
-				fprintf(fp,"%5u\t%12.3f\t%12.3f\t%12.3f\t%12.3f\n",i,a[j],a[j+1],b[j],b[j+1]);
-			}
-			fclose(fp); fp = 0x0;
-			exit(0);
-		}
-		/*****************************************************/
-
-		// [a*u, b*v], [c*u, d*v] separately computed for now:
-
-		// Input vecs to FFT-gcd must be suitably 0-padded at top:
-		i = vec_len;	j = sec_len - (i & (sec_len-1));	// mod-via-AND here assumes sec_len a power of 2
-		ASSERT(HERE, mi64_iszero(u+i,j) && mi64_iszero(v+i,j), "Input vecs to FFT-gcd must be suitably 0-padded at top!");
-
-	/********* 3rd-lower block beneath the highest one - this has the vector-carryins in the hi result half: **********/
-		kk = vec_len-4*sec_len;	// Can try adding a few guard words to our block-start indices
-		// Only really need to clear hi half since lo half overwritten by mi64_cvt_uint64_double call:
-		memset(c, 0, (pad_len<<3));
-		// Process length = sec_len section starting at index [kk]:
-		printf("Processing length-%u section starting at index %u:\n",sec_len,kk);
-		cy_fwd = mi64_cvt_uint64_double(u+kk,v+kk, 0, sec_len, c);	printf("cy_fwd = %d\n",cy_fwd);
-		// Forward FFT of U/V-vector section [in c]:
-		pairFFT_mul(  c,0,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-		// d = copy of 0-padded u/v data stored in fFFTed form in c:
-		memcpy(d, c, (pad_len<<3));
-		// Do the [a*u, b*v] FFT-mul; the subtract-and-store-result-in-Re-part occurs in the carry step:
-		pairFFT_mul(  c,a,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2 (indicates both inputs already fFFTed)
-		// Do the [c*u, d*v] FFT-mul:
-		pairFFT_mul(  d,b,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2
-		// Interleave the [a*u-b*v, c*u-d*v] outputs; c*u-d*v go into odd slots of C-array:
-		for(i = 0; i < fft_len; i+=2) {
-			j = i + ( (i >> DAT_BITS) << PAD_BITS );
-			c[j] *= sign_mult[*sign_ab];	c[j+1] = sign_mult[*sign_cd]*d[j];
-		}
-		cy = mi64_cvt_double_uint64(c,fft_len, x,y);
-
-	/********* 2nd-lower block beneath the highest one - this has the vector-carryins in the hi result half: **********/
-		kk = vec_len-3*sec_len;	// Can try adding a few guard words to our block-start indices
-		// Only really need to clear hi half since lo half overwritten by mi64_cvt_uint64_double call:
-		memset(c, 0, (pad_len<<3));
-		// Process length = sec_len section starting at index [kk]:
-		printf("Processing length-%u section starting at index %u:\n",sec_len,kk);
-		cy_fwd = mi64_cvt_uint64_double(u+kk,v+kk, cy_fwd, sec_len, c);	printf("cy_fwd = %d\n",cy_fwd);
-		// Forward FFT of U/V-vector section [in c]:
-		pairFFT_mul(  c,0,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-		// d = copy of 0-padded u/v data stored in fFFTed form in c:
-		memcpy(d, c, (pad_len<<3));
-		// Do the [a*u, b*v] FFT-mul; the subtract-and-store-result-in-Re-part occurs in the carry step:
-		pairFFT_mul(  c,a,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2 (indicates both inputs already fFFTed)
-		// Do the [c*u, d*v] FFT-mul:
-		pairFFT_mul(  d,b,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2
-		// Interleave the [a*u-b*v, c*u-d*v] outputs; c*u-d*v go into odd slots of C-array:
-		for(i = 0; i < fft_len; i+=2) {
-			j = i + ( (i >> DAT_BITS) << PAD_BITS );
-			c[j] *= sign_mult[*sign_ab];	c[j+1] = sign_mult[*sign_cd]*d[j];
-		}
-		cy = mi64_cvt_double_uint64(c,fft_len, w,z);
-		cy_re = mi64_add(w,x+sec_len,w, sec_len);
-		cy_im = mi64_add(z,y+sec_len,z, sec_len);
-		printf("\tAfter previous-block vector-carry add: cy_re,im = %lld,%lld ... ignoring.\n",cy_re,cy_im);
-
-	/********* Penultimate block: *************************************/
-		kk = vec_len-2*sec_len;
-		// Only really need to clear hi half since lo half overwritten by mi64_cvt_uint64_double call:
-		memset(c, 0, (pad_len<<3));
-		// Process length = sec_len leading section:
-		printf("Processing length-%u section starting at index %u, cy_fwd = %u:\n",sec_len,kk,cy_fwd);
-		cy_fwd = mi64_cvt_uint64_double(u+kk,v+kk, cy_fwd, sec_len, c);	//ASSERT(HERE, cy_fwd == 0, "Unexpected carryout!");***if this were uppermost block
-		cy1 = cy_fwd & 1ull; cy2 = (cy_fwd>>1) & 1ull;	// Save these for later low-64-bit check
-		// Forward FFT of U/V-vector section [in c]:
-		pairFFT_mul(  c,0,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-		// d = copy of 0-padded u/v data stored in fFFTed form in c:
-		memcpy(d, c, (pad_len<<3));
-		// Do the [a*u, b*v] FFT-mul; the subtract-and-store-result-in-Re-part occurs in the carry step:
-		pairFFT_mul(  c,a,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2 (indicates both inputs already fFFTed)
-		// Do the [c*u, d*v] FFT-mul:
-		pairFFT_mul(  d,b,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2
-		// Interleave the [a*u-b*v, c*u-d*v] outputs; c*u-d*v go into odd slots of C-array:
-		for(i = 0; i < fft_len; i+=2) {
-			j = i + ( (i >> DAT_BITS) << PAD_BITS );
-			c[j] *= sign_mult[*sign_ab];	c[j+1] = sign_mult[*sign_cd]*d[j];
-		}
-		// Convert c/d to uint64, yielding a pair of length-(2*sec_len) vectors x,y plus borrow-outs, expected = 0 here:
-		cy = mi64_cvt_double_uint64(c,fft_len, x,y);	//ASSERT(HERE, cy==0, "Unexpected carryout!");***if this were uppermost block
-		cy_re = mi64_add(x,w+sec_len,w+sec_len, sec_len);
-		cy_im = mi64_add(y,z+sec_len,z+sec_len, sec_len);
-		printf("\tAfter previous-block vector-carry add: cy_re,im = %lld,%lld ... ignoring.\n",cy_re,cy_im);
-
-	/********* Uppermost block: *************************************/
-		// Expect result of same FFT-mul-computed [a*u-b*v, c*u-d*v] for uppermost block to exactly
-		// cancel upper half of penultimate-block lin-combo output, except possibly for low 64 bits,
-		// which we cheaply compute via integer arithmetic:
-		kk = vec_len-sec_len;
-		mi64_add_scalar(u+kk,cy1, tvec0,check_len);
-		mi64_add_scalar(v+kk,cy2, tvec1,check_len);
-	// Compute w[2*sec_len+i] = x[sec_len] + SGN(ai[i]*(u[kk+i]+cy1) - bi[i]*(v[kk+i]+cy2), sign_ab) :
-		mi64_mul_vector_lo_half(ai,tvec0, tvec2,check_len);	// ai[i]*(u[kk+i]+cy1)
-		mi64_mul_vector_lo_half(bi,tvec1, tvec3,check_len);	// bi[i]*(v[kk+i]+cy2)
-		if(*sign_ab)
-			mi64_sub(tvec3,tvec2, tvec2,check_len);	// SGN(ai[i]*(u[kk+i]+cy1) - bi[i]*(v[kk+i]+cy2), sgn_ab)
-		else
-			mi64_sub(tvec2,tvec3, tvec2,check_len);
-		mi64_add(x+sec_len,tvec2, w+2*sec_len,check_len);
-	// Compute z[2*sec_len+i] = y[sec_len] + SGN(ci[i]*(u[kk+i]+cy1) - di[i]*(v[kk+i]+cy2), sign_cd) :
-		mi64_mul_vector_lo_half(ci,tvec0, tvec2,check_len);	// ci[i]*(u[kk+i]+cy1)
-		mi64_mul_vector_lo_half(di,tvec1, tvec3,check_len);	// di[i]*(v[kk+i]+cy2)
-		if(*sign_cd)
-			mi64_sub(tvec3,tvec2, tvec2,check_len);	// SGN(ci[i]*(u[kk+i]+cy1) - di[i]*(v[kk+i]+cy2), sgn_cd)
-		else
-			mi64_sub(tvec2,tvec3, tvec2,check_len);
-		mi64_add(y+sec_len,tvec2, z+2*sec_len,check_len);
-	/* Single-low-word version of above:
-		w[2*sec_len] = x[sec_len] + SGN(ai[0]*(u[kk]+cy1) - bi[0]*(v[kk]+cy2), sign_ab);
-		z[2*sec_len] = y[sec_len] + SGN(ci[0]*(u[kk]+cy1) - di[0]*(v[kk]+cy2), sign_cd);
-	*/
-		// At most the lowest 'one-beyond' result word should be nonzero:
-		ASSERT(HERE, mi64_iszero(w+2*sec_len+1,check_len-1) && mi64_iszero(z+2*sec_len+1,check_len-1),"At most the lowest 'one-beyond' result word should be nonzero!");
-	if(0) {
-		fp = fopen("c.txt","w");	ASSERT(HERE, fp != 0x0, "Null file pointer!");
-		fprintf(fp,"Uppermost block of %u FFT data:\n",fft_len);
-		fprintf(fp,"\t\ta*u - b*v\t\tc*u - d*v:\n");
-	//	for(i = 0; i < fft_len; i+=2) {
-		for(i = 0; i < 2*sec_len+1; i++) {
-		//	j = i + ( (i >> DAT_BITS) << PAD_BITS );
-		//	fprintf(fp,"%5u\t%12.3f\t%12.3f\n",i,c[j],c[j+1]);
-			fprintf(fp,"%5u\t%20llu\t%20llu\n",i+vec_len-3*sec_len,w[i],z[i]);
-		}
-		fclose(fp); fp = 0x0;
-		exit(0);
-	}
-
-/*
-Now that have 2*sec_len[+1] leading good words of the full-length u',v' result vectors, compute a fresh
-length = sec_len set of abcd-linear-combination-reduction multipliers for those:
-
-...And now compute the product of the two 2x2 multiplier arrays:
-
-	/aa bb\     /a0 b0\     /a1 b1\     /a0*a1 + b0*c1  a0*b1 + b0*d1\
-   |       | = |       | * |       | = |                              |
-	\cc dd/     \c0 d0/     \c1 d1/     \c0*a1 + d0*c1  c0*b1 + d0*d1/ .
-
-To do this, we need fwd-FFTs of the vector pairs [a0,b0], [c0,d0], [a1,c1], [b1,d1].
-We already have the first 2 pairwise-fwd-FFTs of [a0,b0] and [c0,d0] in our floating-point a,b-vectors.
-*/
-		j = 2*sec_len+1;
-		i = mi64_getlen(w, j);	j = mi64_getlen(v, j);	j = MAX(i,j);
-		// Return value is length of abcd-multipliers:
-		j = mi64_gcd(w,z, j, eGCD=TRUE,ai,bi,len_ab,sign_ab, HALF=TRUE,ci,di,len_cd,sign_cd, sec_len);
-		ASSERT(HERE, j == sec_len, "Unexpected retval from mi64_gcd call to get abcd-multipliers!");
-	#if 1
-		if(!*sign_ab) { printf("a*u-b*v > 0\n"); } else if(*sign_ab == 1) { printf("a*u-b*v < 0\n"); } else { ASSERT(HERE, 0, "foo!"); }
-		if(!*sign_cd) { printf("c*u-d*v > 0\n"); } else if(*sign_cd == 1) { printf("c*u-d*v < 0\n"); } else { ASSERT(HERE, 0, "bar!"); }
-	#endif
-		// Verify that resulting abcd-multipliers have at most (sec_len) set words:
-		i = sec_len;
-		ASSERT(HERE, ai[i] == 0ull && bi[i] == 0ull && ci[i] == 0ull && di[i] == 0ull, "abcd-multipliers exceed expected length!");
-		// Convert a1+I*c1 and b1+I*d1 - note b/c swap vs abcd0-computation - to packed-double form:
-		cy = mi64_cvt_uint64_double(ai,ci,0,sec_len, c);
-		ASSERT(HERE, cy == 0, "FFT_gcd: unexpected carryout of mi64_cvt_uint64_double!");
-		cy = mi64_cvt_uint64_double(bi,di,0,sec_len, d);
-		ASSERT(HERE, cy == 0, "FFT_gcd: unexpected carryout of mi64_cvt_uint64_double!");
-		// Zero upper halves of FFT-multiplier vectors, that is the upper pad_len/2 doubles:
-		memset(c+(pad_len>>1), 0, (pad_len<<2));
-		memset(d+(pad_len>>1), 0, (pad_len<<2));
-		// Because the genfftmul_carry_norm_pow2_errcheck() macro is set up to compute difference of Re,Im=parts
-		// of FFT=mul outputs (as needed for a*u-b*v, c*u-d*v GCD-reduction muls), need to negate Im-parts here
-		// to -c1 and -d1 to yield the desired RHS sums in the above-described 2x2 matrix product:
-		for(i = 0; i < pad_len; i+=2) {	// Save ourselves indexing work by negating both data & pads here
-			c[i+1] = -c[i+1];	d[i+1] = -d[i+1];
-		}
-		// Forward FFT of a1/-c1-vector [in c] and b1/-d1 vector [in d]:
-		pairFFT_mul(  c,d,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-		// Stash a copy of fFFT(c) in next-higher [pad_len]-sized section:
-		memcpy(c+pad_len, c, (pad_len<<3));
-		// Do the [a0*a1,-b0*c1] FFT-mul; the subtract-and-store-[a0*a1 + b0*c1]-result-in-Re-part occurs in the carry step:
-		pairFFT_mul(  c,a,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2 (indicates both inputs already fFFTed)
-		// Do the [a0*b1,-b0*d1] FFT-mul, yielding bb = [a0*b1 + b0*d1]; DONE WITH A-ARRAY AFTERWARD, thus store result in it:
-		pairFFT_mul(  a,d,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2
-		// Interleave the [aa,bb] = [a0*a1 + b0*c1, a0*b1 + b0*d1] outputs, put into A-array:
-		for(i = 0; i < pad_len; i+=2) {
-			// Again do pads and data both; Must move the bb-data from odd to even A-slots first:
-			a[i] = a[i+1];	a[i+1] = c[i];
-		}
-		// C-array now free again, but already stashed copy of fFFT(c) in next-higher [pad_len]-sized section,
-		// so just do FFT-muls directly with that:
-		// Do the [b0*a1,-d0*c1] FFT-mul, yielding cc = [b0*a1 + d0*c1]:
-		pairFFT_mul(c+pad_len,b,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2 (indicates both inputs already fFFTed)
-		// Do the [b0*b1,-d0*d1] FFT-mul, yielding dd = [a0*b1 + b0*d1]; DONE WITH B-ARRAY AFTERWARD, thus store result in it:
-		pairFFT_mul(  b,d,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2
-		// Interleave the [cc,dd] = [b0*a1 + d0*c1, a0*b1 + b0*d1] outputs, put into B-array:
-		for(i = 0; i < pad_len; i+=2) {
-			// Again do pads and data both; Must move the dd-data from odd to even B-slots first:
-			b[i] = b[i+1];	b[i+1] = d[i];
-		}
-#warning to-do! *** cvt a,b back to int and use those 2*sec_len - long ints to do int-mul to test resulting gcd-reduction ***
-		// Lastly, need to 0-pad the now-doubled-length mult-arrays and compute fFFTs of those:
-		memset(a+pad_len, 0, (pad_len<<3));
-		memset(b+pad_len, 0, (pad_len<<3));
-		fft_len <<= 1;	pad_len <<= 1;	// Now double both FFT-len params
-		pairFFT_mul(  a,b,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-
-	retval += sec_len;	// = 2*sec_len
-	return retval;	// Will return actual multiplier length, once have code to compute it
-}
-
-/***********************/
-
-/*
-FFT-mul-accelerated Lehmer-type Euclidean GCD(U, V), where U,V are multiword ints stored
-in base-2^64 positive-digit form; U and V are assumed to both have at least (ndim)
-allocated 64-bit words, with U[0],V[0] being the least-significant words.
-
-RETURNS: The GCD in base-2^64 form in both U and V, and the length of the GCD
-		(in terms of # of significant 64-bit words) in the function value.
-
-		If the boolean EGCD is set on entry, an extended GCD is performed and
-		pointers to the (absolute values of the) A and B-array multipliers are
-		returned in the array pointers Ap, Bp, with corresponding signs indicated
-		via the low 2 bits of the returned sign_AB variable.
-		These array multipliers are defined such that A*U + B*V = GCD(U, V).
-
-EFFECTS: If both inputs are nonzero, the ORIGINAL INPUT VECTOR CONTENTS ARE DESTROYED
-		during the computation, and a copy of the GCD returned in each vector.
-
-		If one or both inputs = 0, neither input is altered, and the function
-		return value is = 0 by convention to signal that this is the case.
-		In order to match mathematical convention, the user should treat the
-		GCD as 0 only if both U and V = 0, and equal to max(U, V)
-		(i.e. the nonzero vector among U and V) if only one of the pair = 0.
-
-CALLS:	mi64_gcd, both [1] in EGCD+HALF mode to generate the smallest FFT-sized
-		multiword ABCD multipliers, and [2] for the final GCD computation once the operands
-		have been reduced in size sufficiently for FFT-mul to lose its effectiveness.
-*/
-uint32	fft_gcd(
-	uint64 u[], uint64 v[], const uint32 ndim,
-	const uint32 EGCD, uint64 Ap[], uint64 Bp[], uint32 *len_AB, uint32 *sign_AB)
-{
-	uint32 i,j,kk,bw,cy,cy_fwd, vec_len, eGCD = FALSE, HALF = FALSE, len_ab,sign_ab, len_cd,sign_cd;
-	static uint32 sec_len, fft_len,pad_len, first_entry=TRUE;
-	 int64 cy_re,cy_im;	// Prefer these to be signed
-	uint64 tmp64 = 0,cy1,cy2;
-	static uint64 *w = 0x0, *x = 0x0, *y = 0x0, *z = 0x0, *ai = 0x0, *bi = 0x0, *ci = 0x0, *di = 0x0;
-	static double *a = 0x0, *b = 0x0, *c = 0x0, *d = 0x0;
-	const double sign_mult[2] = {1.0,-1.0};
-	ASSERT(HERE, IS_ODD(u[0]) && IS_ODD(v[0]), "Even inputs not yet supported!");
-
-	if(first_entry) {
-		fprintf(stderr,"fft_gcd alloc with ndim = %u\n",ndim);
-		ASSERT(HERE, w == 0x0, "alloc-related sentinel pointer non-null!");
-		first_entry=FALSE;
-		/* Allocate the main data arrays: */
-		sec_len = FFTMUL_THRESHOLD_BITS>>2;
-	i = ndim + 2*sec_len;	//*** make large enough to hold mi64_mul_vector results if debugging-to-file
-//	i = sec_len<<1;
-		x  = (uint64 *)calloc(i, sizeof(uint64));
-		y  = (uint64 *)calloc(i, sizeof(uint64));
-		w  = (uint64 *)calloc(i, sizeof(uint64));
-		z  = (uint64 *)calloc(i, sizeof(uint64));
-		ai = (uint64 *)calloc(i, sizeof(uint64));
-		bi = (uint64 *)calloc(i, sizeof(uint64));
-		ci = (uint64 *)calloc(i, sizeof(uint64));
-		di = (uint64 *)calloc(i, sizeof(uint64));
-		// For now just use 16 bits per double:
-		fft_len = 4*sec_len;
-		fft_len *= 2;	/* Extra multiply by 2 here because mi64_cvt_uint64_double returns *complex* FFT length
-						that results from odd/even interleaving and int64-to-real conversion of the 2 input arrays.
-						Thus, we now have the real-vector FFT length, BUT NOT YET THE ZERO-PADDED LENGTH. */
-		pad_len = fft_len + ( (fft_len >> DAT_BITS) << PAD_BITS );	/* Padded-array length */
-		fft_len *= 2;	pad_len *= 2;	// Both of these now reflect real-vector FFT length *with* 0-padding
-		fprintf(stderr,"fft_gcd double-array allocs with pad_len = %u\n",pad_len);
-		a = (double *)calloc(  pad_len, sizeof(double));
-		b = (double *)calloc(  pad_len, sizeof(double));
-		c = (double *)calloc(2*pad_len, sizeof(double));	fprintf(stderr,"Temporary: C-array gets 2x alloc to allow high-half to be used for scratch storage.\n");
-		d = (double *)calloc(  pad_len, sizeof(double));
-		ASSERT(HERE, !ARRAYS_OVERLAP(a,pad_len, b,pad_len), "FFT_gcd: a/b-arrays overlap!");
-		// Call the init-FFT routines, using the (currently unused) c,d-arrays for the needed scratch space:
-		pairFFT_mul(  c,d,0, fft_len, TRUE , FALSE);	// INIT_ARRAYS = TRUE , FORWARD_FFT_ONLY = FALSE
-	}
-
-	/* Test for EGCD: */
-	if(EGCD) ASSERT(HERE, Ap != 0ull && Bp != 0ull && len_AB != 0ull && sign_AB != 0ull, "One or more of the EGCD-required return-pointers null!");
-
-	i = mi64_getlen(u, ndim);	j = mi64_getlen(v, ndim);	vec_len = MAX(i,j);
-	while(vec_len >= 2*sec_len)
-	{
-		// Input vecs to FFT-gcd must be suitably 0-padded at top:
-		i = vec_len;	j = sec_len - (i & (sec_len-1));	// mod-via-AND here assumes sec_len a power of 2
-		mi64_clear(u+i,j);	mi64_clear(v+i,j);
-#warning May need to clear more words for half-length reduction!
-
-		j = 2*sec_len+10;	// Adding 10 guard words to half-exit EGCD inputs to ensure accuracy of au-bv,cu-dv signs
-		// Copy leading ? bits of input-vecs into a pair of uint64-arrays. 16 bits per double ==> 64-bit vec,
-		// but need 2x as many bits in leadU,V as we will use in ensuing FFT-muls, hence the 2*sec_len:
-		mi64_set_eq(w,u+vec_len-j,j);	// w = high j words of u
-		mi64_set_eq(z,v+vec_len-j,j);	// z = high j words of v
-		// Call recursive routine to obtain vector FFT-multipliers for up-to-half-length reduction of [u,v] vectors:
-		// Return value is length of abcd-multipliers:
-		uint32 targ_len = vec_len>>1;
-		fft_gcd_get_mults(u,v,vec_len, targ_len,sec_len, ai,bi,ci,di, w,x,y,z, a,b,c,d, &len_ab,&len_cd,&sign_ab,&sign_cd);
-
-		/************ debug-dump egcd-mults: *****************/
-		if(0) {
-			fp = fopen("a.txt",file_access_mode[vec_len > 16000]);	ASSERT(HERE, fp != 0x0, "Null file pointer!");
-			fprintf(fp,"vec_len = %u\n",vec_len);
-			fprintf(fp,"Leading u,v-elts:\n");
-			for(i = vec_len; i > vec_len-10; i--) {
-				fprintf(fp,"%5u\t%20llu\t%20llu\n",i,u[i],v[i]);
-			}
-			fprintf(fp,"egcd-mults:\n");
-			for(i = 0; i < sec_len; i++) {
-				fprintf(fp,"%5u\t%20llu\t%20llu\t%20llu\t%20llu\n",i,ai[i],bi[i],ci[i],di[i]);
-			}
-			fprintf(fp,"fFFt[a,b]:\n");
-			for(i = 0; i < fft_len; i+=2) {
-				j = i + ( (i >> DAT_BITS) << PAD_BITS );
-				fprintf(fp,"%5u\t%12.3f\t%12.3f\t%12.3f\t%12.3f\n",i,a[j],a[j+1],b[j],b[j+1]);
-			}
-			fclose(fp); fp = 0x0;
-			exit(0);
-		}
-		/*****************************************************/
-
-		// [a*u, b*v], [c*u, d*v] separately computed for now:
-
-		// Convert abcd-multiplier-sized sections of u+I*v to packed-double form, result ==> double-array C.
-		// Unlike the above two mi64_cvt_uint64_double calls, we expect the possibility of
-		// carryouts here because we are [uint64 -> balanced-digit-double] converting sections
-		// of a pair of larger vectors, of which only the most-significant such section must have MSW > 0:
-
-		/**************** pure-int a*u - b*v: ****************/
-		if(fft_gcd_debug) {
-			fp = fopen("a.txt",file_access_mode[vec_len > 16000]);	ASSERT(HERE, fp != 0x0, "Null file pointer!");
-		// Compute a*u - b*v:
-			i = vec_len;	j = sec_len;
-			mi64_clear(x,i+j);	mi64_clear(w,i+j);	mi64_clear(z,i+j);	// w,z will hold intermediate products a*u, b*v
-			mi64_mul_vector(u,i, ai,j, w,&kk);	fprintf(fp,"A*U has %u words = %20llu,%20llu,...\n",kk,w[kk-1],w[kk-2]);
-			mi64_mul_vector(v,i, bi,j, z,&kk);	fprintf(fp,"B*V has %u words = %20llu,%20llu,...\n",kk,z[kk-1],z[kk-2]);
-			for(i = kk-1; (int)i > 0; i--)
-				if(w[i] != z[i]) break;
-			for(j = i-1; (int)j > 0; j--)
-				if(w[j] != (uint64)-1 || z[j] != 0ull) break;
-			fprintf(fp,"First diff at word %5u, next at %5u:\n",i,j);
-			fprintf(fp,"A*U[diff1-2] = %20llu,%20llu,%20llu,...,%20llu\n",w[i],w[i-1],w[i-2], w[j]);
-			fprintf(fp,"B*V[diff1-2] = %20llu,%20llu,%20llu,...,%20llu\n",z[i],z[i-1],z[i-2], z[j]);
-			// |a*u - b*v| result into x:
-		if(!sign_ab) { tmp64 = mi64_sub(w,z, x, vec_len+sec_len); } else { tmp64 = mi64_sub(z,w, x, vec_len+sec_len); }
-			fprintf(fp,"|A*U - B*V| has borrow = %lld:\n",tmp64);	ASSERT(HERE, tmp64 == 0ull,"|A*U - B*V| has nonzero borrow!");
-		// Compute c*u - d*v:
-			i = vec_len;	j = sec_len;
-			mi64_clear(y,i+j);	mi64_clear(w,i+j);	mi64_clear(z,i+j);	// w,z will hold intermediate products d*u, d*v
-			mi64_mul_vector(u,i, ci,j, w,&kk);	fprintf(fp,"C*U has %u words = %20llu,%20llu,...\n",kk,w[kk-1],w[kk-2]);
-			mi64_mul_vector(v,i, di,j, z,&kk);	fprintf(fp,"D*V has %u words = %20llu,%20llu,...\n",kk,z[kk-1],z[kk-2]);
-			for(i = kk-1; (int)i > 0; i--)
-				if(w[i] != z[i]) break;
-			for(j = i-1; (int)j > 0; j--)
-				if(w[j] != (uint64)-1 || z[j] != 0ull) break;
-			fprintf(fp,"First diff at word %5u, next at %5u:\n",i,j);
-			fprintf(fp,"C*U[diff1-2] = %20llu,%20llu,%20llu,...,%20llu\n",w[i],w[i-1],w[i-2], w[j]);
-			fprintf(fp,"D*V[diff1-2] = %20llu,%20llu,%20llu,...,%20llu\n",z[i],z[i-1],z[i-2], z[j]);
-			// c*u - d*v result into y:
-		if(!sign_cd) { tmp64 = mi64_sub(w,z, y, vec_len+sec_len); } else { tmp64 = mi64_sub(z,w, y, vec_len+sec_len); }
-			fprintf(fp,"|C*U - D*V| has borrow = %lld:\n",tmp64);	ASSERT(HERE, tmp64 == 0ull,"|C*U - D*V| has nonzero borrow!");
-		
-			i = mi64_getlen(x, vec_len+sec_len);	fprintf(fp,"A*U - B*V has %u nonzero words\n",i);
-			j = mi64_getlen(y, vec_len+sec_len);	fprintf(fp,"C*U - D*V has %u nonzero words\n",j);
-			kk = MAX(i,j);
-			fprintf(fp,"INT-mul [a*u - b*v, c*u - d*v] for vec_len = %u:\n",vec_len);
-			for(i = 0; i < kk; i++) {
-				fprintf(fp,"%5u\t%20llu\t%20llu\n",i,x[i],y[i]);
-			}
-			fclose(fp); fp = 0x0;
-		exit(0);
-		}
-		/*****************************************************/
-
-		cy_fwd = 0;		// This cy-out-of-the-untransformed-vector-section must be preserved for next block
-		bw = cy = 0;	// These are to manage the 2-blocks-down forwarding of borrows out of the calls to
-						// mi64_cvt_double_uint64 which convert signed FFT-mul-linear-combo data back to uint64[].
-		cy_re = cy_im = 0ull;
-		for(kk = 0; kk < vec_len; kk += sec_len) {
-			// Only really need to clear hi half since lo half overwritten by mi64_cvt_uint64_double call:
-			memset(c, 0, (pad_len<<3));
-			// Process length = sec_len section starting at index [kk]:
-			cy1 = cy_fwd & 1ull; cy2 = (cy_fwd>>1) & 1ull;//	printf("kk = %u: cy_re,im = %lld,%lld, cy_fwd = %u\n",kk,cy_re,cy_im,cy_fwd);
-			cy_fwd = mi64_cvt_uint64_double(u+kk,v+kk,cy_fwd, sec_len, c);
-		//	printf("\tlo64 check1: au = %llu; bv = %llu; cu = %llu; dv = %llu\n",ai[0]*(u[kk]+cy1),bi[0]*(v[kk]+cy2),ci[0]*(u[kk]+cy1),di[0]*(v[kk]+cy2));
-			// Forward FFT of U/V-vector section [in c]:
-			pairFFT_mul(  c,0,0, fft_len, FALSE, TRUE );	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE
-			// d = copy of 0-padded u/v data stored in fFFTed form in c:
-			memcpy(d, c, (pad_len<<3));
-			// Do the [a*u, b*v] FFT-mul; the subtract-and-store-result-in-Re-part occurs in the carry step:
-			pairFFT_mul(  c,a,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2 (indicates both inputs already fFFTed)
-			// Do the [c*u, d*v] FFT-mul:
-			pairFFT_mul(  d,b,0, fft_len, FALSE, 2*TRUE);	// INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = 2
-			// Interleave the [a*u-b*v, c*u-d*v] outputs; c*u-d*v go into odd slots of C-array:
-			for(i = 0; i < fft_len; i+=2) {
-				j = i + ( (i >> DAT_BITS) << PAD_BITS );
-				c[j] *= sign_mult[sign_ab];	c[j+1] = sign_mult[sign_cd]*d[j];
-			}
-			// Convert c/d to uint64, yielding a pair of length-(2*sec_len) vectors x,y plus borrow-outs.
-			// This conversion should only ever yield borrows (if anything), never carries, i.e. each of
-			// the 2-bit fields encoding cy/bw-outs should == 0 or 3, hence acceptable values for cy = 0,3,12,15:
-			bw = cy;	// Save pvs pass value of cy (really a borrow), since we need to preserve these across 2 blocks
-						// After 1 loop pass, may have a nonzero cy ... thus after 2 passes we may have a nonzero bw.
-			cy = mi64_cvt_double_uint64(c,fft_len, x,y);//	printf("\tmi64_cvt_double_uint64: cy_lo,hi = %u,%u\n",cy&3,cy>>2);
-			if(cy)
-				ASSERT(HERE, cy==3 || cy==12 || cy==15, "Unexpected carryout!");
-		//	printf("\tlo64 check2: x0 = %llu; y0 = %llu\n",x[0],y[0]);
-			ASSERT(HERE, SGN(x[0],sign_ab) == ai[0]*(u[kk]+cy1) - bi[0]*(v[kk]+cy2) && SGN(y[0],sign_cd) == ci[0]*(u[kk]+cy1) - di[0]*(v[kk]+cy2), "lo64 check fails!");
-			// Lower halves of x,y directly overwrite the current u,v-sections:
-			mi64_set_eq(u+kk,x,sec_len);	mi64_set_eq(v+kk,y,sec_len);
-			// Upper halves of x,y from previous loop execution (stored in w,z) plus borrowouts from
-			// mi64_cvt_double_uint64 conversion of those pvs x,y-vectors added to post-multiply sections of u,v:
-			if(kk > 0) {
-			#if 1	// Dumb no-op approach:
-				// Will propagate cyouts into low words of upper halves of x,y when we copy latter into w,z below:
-				cy_re = mi64_add(w,u+kk,u+kk, sec_len);
-				cy_im = mi64_add(z,v+kk,v+kk, sec_len);
-			//	printf("\tAfter previous-block vector-carry add: cy_re,im = %lld,%lld ... ignoring.\n",cy_re,cy_im);
-			#else
-				if(cy_re < 0) {
-					cy_re = mi64_add(w,u+kk,u+kk, sec_len);				// I think we can ignore these if past the new reduced u,v lengths:
-					tmp64 = mi64_sub_scalar(u+kk, 1ull, u+kk, sec_len);	if(tmp64 != 0ull) printf("INFO: Nonzero borrow = -%lld out of u--\n",tmp64);
-				} else
-					cy_re = mi64_add_cyin(w,u+kk,u+kk, sec_len, cy_re);
-
-				if(cy_im < 0) {
-					cy_im = mi64_add(z,v+kk,v+kk, sec_len);				// ... but may need handling otherwise. For now, just report:
-					tmp64 = mi64_sub_scalar(v+kk, 1ull, v+kk, sec_len);	if(tmp64 != 0ull) printf("INFO: Nonzero borrow = -%lld out of u--\n",tmp64);
-				} else
-					cy_im = mi64_add_cyin(z,v+kk,v+kk, sec_len, cy_im);
-				printf("\tAfter previous-block vector-carry add: cy_re,im = %lld,%lld\n",cy_re,cy_im);
-			#endif
-			}
-			// Upper halves of x,y will get added to low halves of [a*u-bv, c*u-b*v] output for *NEXT* pair of
-			// u,v-sections (which have not yet been linearly combined, i.e. must wait until next loop pass),
-			// so now that have processed previous pair of vector-addends, save those in w,z for later addition;
-			mi64_set_eq(w,x+sec_len,sec_len);
-			mi64_set_eq(z,y+sec_len,sec_len);
-		#if 1	// Dumb no-op approach:
-			cy_re = cy_im = 0ull;
-		#else
-			if(cy_re < 0) {
-				cy_re = mi64_sub_scalar(w,cy_re, w, sec_len);
-				if(cy_re != 0ull) printf("\tNonzero bw = %lld out of w--!\n", ABS(cy_re));
-			} else {
-				cy_re = mi64_add_scalar(w,cy_re, w, sec_len);
-				if(cy_re != 0ull) printf("\tNonzero cy = %lld out of w++!\n", ABS(cy_re));
-			}
-			if(cy_im < 0) {
-				cy_im = mi64_sub_scalar(z,cy_im, z, sec_len);
-				if(cy_im != 0ull) printf("\tNonzero bw = %lld out of z--!\n", ABS(cy_im));
-			} else {
-				cy_im = mi64_add_scalar(z,cy_im, z, sec_len);
-				if(cy_im != 0ull) printf("\tNonzero cy = %lld out of z++!\n", ABS(cy_im));
-			}
-			printf("\tAfter w,z incr/decr: cy_re,im = %lld,%lld\n",cy_re,cy_im);
-			if(bw) {
-				if(sign_ab == 0) cy_re -= ((bw & 3) != 0);	// Only apply borrow if a*u-b*v > 0
-				if(sign_cd == 0) cy_im -= ((bw >> 2) != 0);	// Only apply borrow if c*u-d*v > 0
-				printf("\tsign_mask*bw = %d*%d,%d*%d ==> cy_re,im = %lld,%lld\n",(sign_ab == 0),-((bw & 3) != 0),(sign_cd == 0),-((bw >> 2) != 0),cy_re,cy_im);
-			}
-		#endif
-		}	// endfor(kk = 0; kk < vec_len; kk += sec_len)
-		/**************** FFT-mul a*u - b*v: ****************/
-		if(fft_gcd_debug) {
-			fp = fopen("c.txt",file_access_mode[vec_len > 16000]);	ASSERT(HERE, fp != 0x0, "Null file pointer!");
-			fprintf(fp,"FFT-mul [a*u - b*v, c*u - d*v] for vec_len = %u:\n",vec_len);
-			for(i = 0; i < vec_len; i++) {
-				fprintf(fp,"%5u\t%20llu\t%20llu\n",i,u[i],v[i]);
-			}
-			fclose(fp); fp = 0x0;
-		//	exit(0);
-		}
-		/*****************************************************/
-		ASSERT(HERE, cy_fwd == 0, "Overall u,v-vectors must be > 0!");
-		// For the final block, we *expect* both carries (and all elts of the w,z-vectors) to == -1;
-		// the real sign of interest in this case is encoded in the high-order elements of u and v:
-		kk -= sec_len;
-		cy_re = u[kk-1];	ASSERT(HERE, ABS(cy_re) < 2, "Unexpected Re-carryout!");
-		cy_im = v[kk-1];	ASSERT(HERE, ABS(cy_im) < 2, "Unexpected Im-carryout!");
-	//	printf("\tOutput signs: cy_re,im = %lld,%lld\n",cy_re,cy_im);
-		// If either (a*u - b*v) or (c*u - d*v) result < 0, negate it - to account for multiword uint64[] carry
-		// chain here do as in matrix_vector_product_sub - logical negation and reinjection of -cy = +1 at bottom:
-		if(cy_re == -1) {
-			ASSERT(HERE, 0, "Post-mul negation should not be needed!");
-			for(j = 0; j < kk; j++)
-				u[j] = ~u[j];	/* Ones' complement */
-			// Reinject -cy at bottom, exit as soon as ripple carry peters out:
-			for(j = 0; j < kk; j++)
-				if(++u[j] != 0) break;
-		}
-		// Im-part carryout in low 2 bits of cy:
-		if(cy_im == -1) {
-			ASSERT(HERE, 0, "Post-mul negation should not be needed!");
-			for(j = 0; j < kk; j++)
-				v[j] = ~v[j];	/* Ones' complement */
-			// Reinject -cy at bottom, exit as soon as ripple carry peters out:
-			for(j = 0; j < kk; j++)
-				if(++v[j] != 0) break;
-		}
-		// Now compute updated vector lengths:
-		i = mi64_getlen(u, kk);		j = mi64_getlen(v, kk);		vec_len = MAX(i,j);
-	//	printf("\tFinal words: u[%u],v[%u] = %llu,%llu\n",i,j,u[i-1],v[j-1]);
-	printf("New vec_len = %u",vec_len);
-//	printf("\n");
-	}	// endwhile(vec_len >= 2*sec_len)
-
-	j = mi64_gcd(u, v, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0);
-	if(EGCD) {
-		ASSERT(HERE, 0, "EGCD not yet supported by FFT-gcd!");
-	//**** Do final FFTmul of cleanup-mi64_gcd-produced AB multipliers with ones accumulated in FFT-gcd passes ****
-	}
-	return j;
-}
-
-/*
-Lehmer-type Euclidean GCD(U, V), where U and V are two multiword integers stored
-in base-2^64 positive-digit form; U and V are assumed to both have at least (ndim)
-allocated 64-bit words, with U[0],V[0] being the least-significant words.
-
-RETURNS: The GCD in base-2^64 form in both U and V, and the length of the GCD
-		(in terms of # of significant 64-bit words) in the function value.
-
-		If the boolean EGCD is set on entry, an extended GCD is performed.
-		For simplicity (e.g. so we can use the mi64 functions to manipulate) use uint64 arrays to
-		store magnitudes of EGCD multiplier-vectors, separately track the corresponding sign bits.
-		Upon completion, pointers to the (absolute values of the) A and B-array multipliers are
-		returned in the arglist arrays Ap, Bp, with corresponding signs indicated
-		via the low 2 bits of the returned sign_AB variable.
-		These array multipliers are defined such that A*U + B*V = GCD(U, V).
-
-		If EGCD is set and the boolean HALF is also set on entry, an extended GCD
-		is performed but 'only halfway', in the sense that we return as soon as
-		the four multipliers A,B,C,D are all >= len_targ words long ('Half' refers to
-		my original thought that this should be >= half the input length; somewhat of
-		a misnomer for the final version, in which the user specifies the targeted
-		length reduction of the partial-GCD).
-			In this case, intended to generate multiword intermediate multipliers for
-		use by an FFT-mul-accelerated reduction-via-linear-combination scheme (which
-		in a further recursive framework yields a true subquadratic GCD algorithm), we
-		also return the C and D-array multipliers in the arglist arrays Cp, Dp, with
-		corresponding signs in the low 2 bits of sign_CD. If EGCD = HALF = true, the
-		return value is the length of the largest ABCD-multiplier vectors.
-		If EGCD = 0, HALF is ignored.
-
-EFFECTS: If both inputs are nonzero, the ORIGINAL INPUT VECTOR CONTENTS ARE DESTROYED
-		during the computation, and a copy of the GCD returned in each vector.
-
-		If one or both inputs = 0, neither input is altered, and the function
-		return value is = 0 by convention to signal that this is the case.
-		In order to match mathematical convention, the user should treat the
-		GCD as 0 only if both U and V = 0, and equal to max(U, V)
-		(i.e. the nonzero vector among U and V) if only one of the pair = 0.
-*/
-uint32	mi64_gcd(
-	uint64 u[], uint64 v[], uint32 const ndim,
-	const uint32 EGCD, uint64 Ap[], uint64 Bp[], uint32 *len_AB, uint32 *sign_AB,
-	const uint32 HALF, uint64 Cp[], uint64 Dp[], uint32 *len_CD, uint32 *sign_CD, const uint32 len_targ)
-{
-#if GCD_DEBUG>=1
-	/* stuff needed for known-divisor check: */
-	static uint64 *div = 0x0, *rem = 0x0;
-#endif
-	static uint64 *quo = 0x0;	// Use this in non-debug mode, as well
-/* Set max. scalar multiplier bits = 63 if 128-bit integer multiply available; otherwise set = 52. */
-#if USE_FLOATING_MULH64
-	const int max_bits = 52;
-#else
-	const int max_bits = 63;
-#endif
-
-	int i,istart,j,k,len_diff;
-	uint32 j1,j2,k1 = 0,k2 = 1,lz0,lz1,tz0,tz1,min_trailing_zeros=0,nshift,egcd_sign_swap = 0;
-	uint64 *uv_ptr[2];
-	uint64 *x;
-	uint64 iop,lead_u[2],lead_v[2],coeff[2][2],diff[2],range[2][2];
-	uint64 abmul[2],cdmul[2],max_abcd,t1,t2,t3,t4,lo,hi, uv0[2];
-	const double rbase = pow(2.0, 64), log10_base = 19.26591972249479649367928926;	/* 2^64, log10(2^64) */
-
-	double num,den,frac;
-	uint32 pass=0, lenU, lenV;	/* Note that lenFoo stores the NUMBER OF SIGNIFICANT ELEMENTS of vector Foo
-								(i.e. leading element is in Foo[lenFoo-1]), not the # of allocated elements! */
-	uint32 tmp32;
-	static uint32 ndim_save = 0;/* Stores the current allocation for the multiplier arrays
-								A,B,C,D used for extended GCD. Only used if EGCD = TRUE. */
-	uint32 len_abcd;
-	static uint64 *A = 0x0,*B = 0x0,*C = 0x0,*D = 0x0;	/* Multiplier arrays for extended GCD.
-								For simplicity (e.g. so we can use the mi64 functions to manipulate) use uint64s
-								to store the magnitudes, and separately track the corresponding sign bits... */
-	uint32 sign_abcd;	/* ...in the low 2 bits of this variable (i.e. the 0-bit of sign_abcd
-						stores sign(A) and the 1-bit of sign_abcd stores sign(C). Since each of
-						B and D must have the opposite sign of A and C, respectively, we need not
-						explicitly store the latter two signs. */
-	/* Set up for easily-permutable access to the (vector) elements
-	of the 2x2 eGCD multiplier matrix, in columnwise fashion: */
-	uint64 *ac_ptr[2];
-	uint64 *bd_ptr[2];
-
-	ASSERT(HERE, ndim != 0,"ndim == 0");
-	ASSERT(HERE,!mi64_iszero(u, ndim),"U-input 0");
-	ASSERT(HERE,!mi64_iszero(v, ndim),"V-input 0");
-
-	quo = (uint64 *)calloc(ndim, sizeof(uint64));
-#if GCD_DEBUG>=1
-	/* known divisor, quotient, remainder vectors needed for mi64_div check: */
-	div = (uint64 *)calloc(ndim, sizeof(uint64));
-	rem = (uint64 *)calloc(ndim, sizeof(uint64));
-	// To check divisibility-by-known-divisor, set divisor above, then insert the following code (with the #if restored) as desired:
-	#if 0//GCD_DEBUG>=1
-	div[0] =  4235226679561594903ull;
-	div[1] =    94004235929829273ull;
-	if(gcd_debug) {
-		mi64_div(uv_ptr[k2],div,quo,rem,lenU);
-		ASSERT(HERE, mi64_iszero(rem, lenU), "divisor check fails!");
-	}
-	#endif
-#endif
-#if GCD_DEBUG>=1
-	if(gcd_debug) {
-		printf("Inputs to GCD are:\n");
-		printf("                         U                   V  \n");
-
-		// Print format uses the PARI/GP arithmetic operator precedence rules, under the
-		// assumption that we may be checking the code outputs using that program:
-		printf("u=v=0;\n");
-		for(i=0; i<ndim; i++)
-			printf("i = %u; u+=%20llu<<(i<<6); v+=%20llu<<(i<<6);\n",i,u[i],v[i]);	// For every i++, shift count += 64
-		printf("\n");
-	}
-#endif
-
-	/* Test for EGCD: */
-	if(EGCD) {
-		ASSERT(HERE, Ap != 0ull && Bp != 0ull && len_AB != 0ull && sign_AB != 0ull, "One or more of the EGCD-required return-pointers null!");
-	  if(HALF)
-		ASSERT(HERE, Cp != 0ull && Dp != 0ull && len_CD != 0ull && sign_CD != 0ull, "One or more of the HALF-required return-pointers null!");
-
-		if(ndim_save < ndim) {
-			// If mul-array allocation not yet performed or insufficient, allocate to hold (ndim) uint64s:
-			free((void *)A); A = (uint64 *)calloc(ndim, sizeof(uint64));
-			free((void *)B); B = (uint64 *)calloc(ndim, sizeof(uint64));
-			free((void *)C); C = (uint64 *)calloc(ndim, sizeof(uint64));
-			free((void *)D); D = (uint64 *)calloc(ndim, sizeof(uint64));
-			ndim_save = ndim;
-		//	fprintf(stderr,"mi64_gcd alloc with ndim = %u\n",ndim);
-		}
-		memset(A,0,ndim<<3);	memset(B,0,ndim<<3);	memset(C,0,ndim<<3);	memset(D,0,ndim<<3);
-
-		/* Initialize:
-
-			(  A  -B )   ( 1  0 )
-			(        ) = (      )
-			( -C   D )   ( 0  1 ) .
-
-		We update the vectors forming the elements of this 2x2 multiplier matrix
-		such that any stage of the computation, we have
-
-			( u )   (  A  -B ) ( U )
-			(   ) = (        )*(   )
-			( v )   ( -C   D ) ( V ) , where U and V are the original input vectors (not stored).
-		*/
-		len_abcd  = 1;
-		/* We separately store the magnitudes and signs of the elements of the A/B/C/D matrix -
-		the initial magnitudes are those of the 2x2 identity-matrix elements, i.e. 1/0/0/1;
-		the initial signs are A/B = +/- and C/D = -/+, we store sgn(A) in bit 0, sgn(B) in bit 1,
-		thus A/C = +- maps to bit pattern 10 in the low 2 bits of sign_abcd: */
-		sign_abcd = 0x2;
-		A[0] = D[0] = 1;
-		B[0] = C[0] = 0;
-	}
-
-	/* Init the pointers to the U/V arrays: */
-	uv_ptr[0] = u;	uv_ptr[1] = v;
-
-	/* Init the pointers to the A/B/C/D arrays: */
-	ac_ptr[0] = A;	bd_ptr[0] = B;
-	ac_ptr[1] = C;	bd_ptr[1] = D;
-	uv0[0] = u[0];	uv0[1] = v[0];	// Save low limb of each input for EGCD sign determination and sanity check
-
-FIND_LARGER:
-
-	/* k1 and k2 index into the uv_ptr array - uv_ptr[k1] points to larger of u and v: */
-	istart = ndim-1;
-	lenU = 0;	/* This allows us to determine whether lenU has already been set on a previous pass in the ensuing loop. */
-	for(i=istart; i>=0; i--) {
-		if((u[i] != 0 || v[i] != 0) && lenU == 0)
-			lenU = i+1;
-		/* Even if we've already found the leading nonzero element,
-		in order to properly set k1 and k2 continue until find a differing element:
-		*/
-		if(u[i] == v[i])
-			continue;
-		else {
-			k1 = (u[i] < v[i]);
-			k2 = k1 ^ 0x00000001;
-			break;
-		}
-	}
-	/*	If U = V, both inputs identical:
-	*/
-	if(i == -1) {
-		printf("INFO: Identical inputs to MI64_GCD.\n");
-		lenV = lenU;	k1 = 0; k2 = 1;
-		goto GCD_DONE;
-	}
-
-	/* Otherwise, parse the smaller array until find leading nonzero element: */
-	x = uv_ptr[k2];
-	for(i=lenU-1; i>=0; i--) {
-		if(x[i] != 0) break;
-	}
-	lenV = i + 1;
-
-	/*	If lenV = 0, one of the inputs = 0, which is an error: */
-	if(lenV == 0)
-	{
-		fprintf(stderr, "ERROR: Zero input vector to MI64_GCD!\n");
-		ASSERT(HERE, 0,"0");
-	}
-
-	/* A small additional bit of preprocessing: count the trailing zeros in each vector and
-	right-shift both by the respective amounts, saving the smaller of the 2 shift counts
-	for output of GCD result as GCD(right-justified inputs) * 2^(initial right-shift count).
-	We need to make both inputs into the main loop odd here, since in the loop there's a
-	left-justify-the-smaller-of-the-two-vectors step which would otherwise lead to spurious
-	powers of 2 appearing in the final result.
-	*/
-
-#if 1
-	tz0 = mi64_trailz(uv_ptr[k1], lenU);
-	tz1 = mi64_trailz(uv_ptr[k2], lenV);
-	min_trailing_zeros = MIN(tz0,tz1);
-#else
-/**** 8/23/2005: disable for now, since this really complicates things when doing eGCD. ****/
-	/* Right-shift U: */
-	tz0 = mi64_trailz(uv_ptr[k1], lenU);
-	mi64_shrl(uv_ptr[k1], uv_ptr[k1], tz0, lenU);	lenU = mi64_getlen(uv_ptr[k1], lenU);
-
-	/* Right-shift V: */
-	tz1 = mi64_trailz(uv_ptr[k2], lenV);
-	mi64_shrl(uv_ptr[k2], uv_ptr[k2], tz1, lenV);	lenV = mi64_getlen(uv_ptr[k2], lenV);
-
-	/* 21 Aug 2005:
-	How does such a right-shift affect the eGCD multiplier arrays?
-	In doing an eGCD, at any stage of the computation we are assumed to have
-	(where U and V are the original input vectors, not stored)
-
-		( u )   (  A  -B ) ( U )
-		(   ) = (        )*(   )
-		( v )   ( -C   D ) ( V ) ,
-
-	So multiplying u or v by some constant is equivalent to multiplying both of A/B or C/D, respectively,
-	by the same constant. In the case of a right-shift of u or v, since we can't assume that each of
-	A/B or C/D will be divisible by the same power of 2, we instead LEFT-SHIFT the complement vector (v or u)
-	by the same number of places, accumulating these u/v shifts in separate counters so we can undo them
-	at the conclusion of the computation.
-	*/
-	if(EGCD) {
-		if     (tz0 > tz1) {
-			/* Check left-shift return value to see if len_abcd needs to be updated: */
-// 2005: ******************** start here - need to pad length by abs(tz0-tz1)/64 + 1 prior to shifting *********************************
-			mi64_shl(ac_ptr[k2], ac_ptr[k2], tz0-tz1, len_abcd);
-			mi64_shl(bd_ptr[k2], bd_ptr[k2], tz0-tz1, len_abcd);	len_abcd = MAX(mi64_getlen(ac_ptr[k2], len_abcd) , mi64_getlen(bd_ptr[k2], len_abcd));
-		} else if(tz0 < tz1) {
-			/* len_abcd will be automatically updated during the 2 shifts
-			so the final value corresponds to the larger of the 2 result vectors: */
-			mi64_shl(ac_ptr[k1], ac_ptr[k1], tz1-tz0, len_abcd);
-			mi64_shl(bd_ptr[k1], bd_ptr[k1], tz1-tz0, len_abcd);	len_abcd = MAX(mi64_getlen(ac_ptr[k1], len_abcd) , mi64_getlen(bd_ptr[k1], len_abcd));
-		}
-	}
-
-	/* If this is our second and final pass this way, make sure both operands are odd, i.e. tz0 = tz1 = 0: */
-	if(min_trailing_zeros != 0)
-		ASSERT(HERE, MAX(tz0,tz1) == 0,"gcd_lehmer.c: MAX(tz0,tz1) == 0");
-	else if(MIN(tz0,tz1) > 0) {
-		ASSERT(HERE, min_trailing_zeros == 0,"gcd_lehmer.c: min_trailing_zeros == 0");	/* Should only have to do this once. */
-		min_trailing_zeros = MIN(tz0,tz1);
-		goto FIND_LARGER;
-	}
-
-  #if GCD_DEBUG>=2
-	if(gcd_debug && (tz0 || tz1)) {
-		printf("Right-justified Inputs to GCD are:\n");
-		printf("                         U                   V  \n");
-		printf("u=v=0;\n");
-		for(i=0; i<lenU; i++)
-			printf("i = %u; u+=%20llu<<(i<<6); v+=%20llu<<(i<<6);\n",i,u[i],v[i]);	// For every i++, shift count += 64
-		printf("\n");
-	}
-  #endif
-
-#endif
-
-/*	Main loop is here.	*/
-
-#if GCD_DEBUG>=2
-if(gcd_debug) printf("PASS %d: lenU = %u, lenV = %u, k1,k2 = %d,%d\n",pass,lenU,lenV,k1,k2);
-#endif
-
-for(;;)	/* MAIN */
-{
-	pass++;
-
-#if 1
-	lz0 = mi64_leadz(uv_ptr[k1], lenU);
-	lz1 = mi64_leadz(uv_ptr[k2], lenV) + ((lenU - lenV)<<6);
-	ASSERT(HERE, lz0 <= lz1,"gcd_lehmer.c: lz0 > lz1!");
-	#if GCD_DEBUG>=2
-		if(gcd_debug) printf("lz_diff = %u bits\n",lz1-lz0);
-	#endif
-	if(lz1 - lz0 > 64)
-	{
-		sprintf(string0,"U (lz0 = %u bits) more than 64 bits larger than V (lz1 = %u bits) in MI64_GCD!\n",lz0,lz1);
-		WARN(HERE, string0, "", TRUE);
-	}
-#endif
-/**** 8/23/2005: disable for now, since this really complicates things when doing eGCD. ****/
-	/* In order to guarantee a nontrivial 2x2 scalar multiplier matrix,
-	make sure that U is no more than max_bits bits greater than V. in
-	fact, while we're at it, let's make the difference < max_bits/2: */
-#if 1
-	/* First, do any needed full-word-sized shifts: */
-	len_diff = lenU - lenV;
-	ASSERT(HERE, len_diff >= 0,"gcd_lehmer.c: len_diff >= 0");
-
-	if(len_diff > (lenU>>1))	// V less than 1/2 length of U
-	{
-		// Try division-with-remainder in large-size-disparity cases - this is very slow in terms of timing,
-		// but only because the current div implementation is bitwise-quadratic:
-		mi64_div(uv_ptr[k1],uv_ptr[k2],lenU,lenU,quo,uv_ptr[k1]);	// To-Do: Use quotient to adjuct the eGCD coeffs to reflect the div.
-		ASSERT(HERE, mi64_cmpult(uv_ptr[k1],uv_ptr[k2], lenU), "divisor check fails!");
-		j = mi64_getlen(uv_ptr[k1], lenU);
-		ASSERT(HERE, (j <= lenV) && mi64_cmpult(uv_ptr[k1],uv_ptr[k2],lenV),"gcd_lehmer.c: lenV == lenU");
-		lenU = lenV; lenV = j;
-		k1 ^= 0x00000001;
-		k2 ^= 0x00000001;
-		/* If lenV = 0, we're done. */
-		if(lenV == 0)
-			goto GCD_DONE;
-	}
-	else if(len_diff > 0)
-	{
-		/*	if lenU > lenV and leading digit(v) < leading digit(u),
-		left-shift V by len_diff places	*/
-		if(uv_ptr[k2][lenV-1] < uv_ptr[k1][lenU-1])
-		{
-		#if GCD_DEBUG>=1
-			if(gcd_debug) printf("pass %u , A: left-shifting V-array by %d words\n",pass,len_diff);
-		#endif
-			mi64_shl(uv_ptr[k2], uv_ptr[k2],  len_diff<<6, lenU);
-			lenV = mi64_getlen(uv_ptr[k2], lenU);
-			ASSERT(HERE, lenV == lenU,"gcd_lehmer.c: lenV == lenU");
-		}
-		/*	otherwise left-shift V by (len_diff - 1) places.	*/
-		else if(len_diff > 1)
-		{
-		#if GCD_DEBUG>=1
-			if(gcd_debug) printf("pass %u , B: left-shifting array %u by %d words\n",pass,k2,len_diff-1);
-		#endif
-			mi64_shl(uv_ptr[k2], uv_ptr[k2], (len_diff-1)<<6, lenU);
-			lenV = mi64_getlen(uv_ptr[k2], lenU);
-			ASSERT(HERE, lenV == lenU-1,"gcd_lehmer.c: lenV == lenU-1");
-		}
-
-		/*	Now that lenV is no less than lenU - 1, count leading zeros and
-		see if any less-than-full-word-sized supplemental shift is warranted:
-		*/
-		lz0 = mi64_leadz(uv_ptr[k1], lenU);
-		lz1 = mi64_leadz(uv_ptr[k2], lenV) + ((lenU - lenV)<<6);
-		j = lz1 - lz0;	ASSERT(HERE, j >= 0,"ERROR: lz0 > lz1 in MI64_GCD!");
-		if(j >= (max_bits>>1))
-		{
-			// If larger vector has any trailing zeros resulting from a previous such shift, right-justify it first:
-			i = trailz64(uv_ptr[k1][0]);
-			if(i) {
-				ASSERT(HERE, i < j, "Excessive trailing zeros in MI64_GCD!");
-				mi64_shrl(uv_ptr[k1], uv_ptr[k1], i, lenU);	lenU = mi64_getlen(uv_ptr[k1], lenU);
-			}
-		#if GCD_DEBUG>=1
-			if(gcd_debug) {
-				printf("pass %u, C: left-shifting vector %u by %u-%u-1 = %u bits;\n",pass,k2,j,i,j-i-1);
-			//	printf("Input  = %s\n",&str_10k[__convert_mi64_base10_char(str_10k, 10<<10, uv_ptr[k2], lenV, 0)]);
-			}
-		#endif
-			// Must use lenU as shift-operand length, since result may have a carry into next-higher word, which is discarded otherwise.
-			mi64_shl(uv_ptr[k2], uv_ptr[k2], j-i-1, lenU);	lenV = mi64_getlen(uv_ptr[k2], lenU);
-		}
-	}
-
-  #if GCD_DEBUG>=2
-	if(gcd_debug && pass>=0) {
-		printf("Left-justified Inputs to GCD are:\n");
-		printf("                         U                   V  \n");
-		printf("u=v=0;\n");
-		for(i=0; i<lenU; i++) {
-			printf("i = %u; u+=%20llu<<(i<<6); v+=%20llu<<(i<<6);\n",i,u[i],v[i]);	// For every i++, shift count += 64
-		}
-		printf("\n");
-	}
-  #endif
-
-#endif	/* #if(1) */
-
-	/*
-	Get leading 128 bits of U and V (starting with the MSB of max(U, V))
-	to form the startvalues for the Lehmer GCD:
-	*/
-	nshift = leadz64(uv_ptr[k1][lenU-1]);
-	ASSERT(HERE, nshift < 64,"gcd_lehmer.c: nshift < 64");
-
-	mi64_extract_lead128(uv_ptr[k1], lenU, nshift, lead_u);
-	mi64_extract_lead128(uv_ptr[k2], lenU, nshift, lead_v);
-
-#if 0	/****** OBSOLETE ******/
-	#error Do not use this code!
-
-	mvbits64(uv_ptr[k1][lenU-1],0        ,64-nshift,&range[0][1],nshift);	/* move nonzero bits of leading digit into high word (2) of 128-bit slot, left-justifying...*/
-	mvbits64(uv_ptr[k2][lenU-1],0        ,64-nshift,&range[1][1],nshift);
-
-  if(lenU > 1)
-  {
-	mvbits64(uv_ptr[k1][lenU-2],64-nshift,nshift   ,&range[0][1],0     );	/* if leading digit had any leftmost zero bits, fill low-order bits of high word with high bits of next digit...*/
-	mvbits64(uv_ptr[k2][lenU-2],64-nshift,nshift   ,&range[1][1],0     );
-
-	mvbits64(uv_ptr[k1][lenU-2],0        ,64-nshift,&range[0][0],nshift);	/* move leading bits of (lenU-1)st digit into low word (1) of 128-bit slot, left-justifying...*/
-	mvbits64(uv_ptr[k2][lenU-2],0        ,64-nshift,&range[1][0],nshift);
-  }
-
-  if(lenU > 2)
-  {
-	mvbits64(uv_ptr[k1][lenU-3],64-nshift,nshift   ,&range[0][0],0     );	/* if necessary, fill low-order bits of low word with high bits of (lenU-2)nd digit. */
-	mvbits64(uv_ptr[k2][lenU-3],64-nshift,nshift   ,&range[1][0],0     );
-  }
-#endif
-
-#if GCD_DEBUG>=2
-if(gcd_debug) {
-	printf("Indices into U/V = %u, %u\n",k1,k2);
-	printf("\n");
-	printf("NSHIFT = %u:\n", nshift);
-	printf("\n");
-	printf("leadU[1,0] = %20llu   %20llu\n",lead_u[1],lead_u[0]);
-	printf("leadV[1,0] = %20llu   %20llu\n",lead_v[1],lead_v[0]);
-	printf("\n");
-}
-else if(0)
-	printf("Indices into U/V = %u, %u\n",k1,k2);
-#endif
-
-/*	This is the loop that calculates the scalar multipliers for the Lehmer scheme: */
-
-	/*	Initialize 2 x 2 matrix coefficients, difference and range arrays:	*/
-
-	coeff[0][0] = coeff[1][1] = 1;		/* x,y multipliers here;		*/
-	coeff[0][1] = coeff[1][0] = 0;		/* init to 2x2 identity matrix. */
-	diff [0] = diff [1] = 1;	/* accumulated error here. */
-
-	range[0][0] = lead_u[0];	range[0][1] = lead_u[1];
-	range[1][0] = lead_v[0];	range[1][1] = lead_v[1];
-
-	i = 0;	/* initial row A index */
-	j = 1;	/* initial row B index */
-
-	for(;;)	/* CALC_MULTS */
-	{
-	/*	use FP approximation to diagonal terms and FDIV to form integer multiplier. */
-
-		/* calculate range + diff (128-bit sum), store in (t1,t2). */
-
-		t1 = range[j][0] + diff[j];
-		t2 = range[j][1] + (t1 < range[j][0]);	/* if (range[j,0] + diff[j] < range[j,0]), had a carry out of the 64-bit add. */
-
-		num = (double)range[i][1]*rbase + (double)range[i][0];
-		den = (double)t2         *rbase + (double)t1;
-
-		/* If zero denominator, break: */
-		if(den == 0) {
-		#if GCD_DEBUG>=2
-			if(gcd_debug) printf("EXIT oo\n");
-		#endif
-			break;	/* EXIT CALC_MULTS */
-		}
-
-		frac = num/den;
-		iop = (uint64)frac;	// Use FDIV to calculate FP approximate to ratio, then integer truncate upon store.
-	#if GCD_DEBUG>=2
-		if(gcd_debug) printf("frac = %20.15e, iop = %20llu\n",frac,iop);
-	#endif
-
-		// If FP result is just a smidge < 1, force = 1 to prevent premature loop exit for nearly-indeitcal 128-bit num,den values:
-		if(iop == 0 && frac > 0.999999999999999) {
-			iop = 1;
-			#if GCD_DEBUG>=2
-				if(gcd_debug) {
-				//	printf("frac = %20.15e, iop = %20llu\n",frac,iop);
-					printf("n1 = %20llu, n0 = %20llu\n",range[i][1],range[i][0]);
-					printf("d1 = %20llu, d0 = %20llu\n",t2,t1);
-				}
-			#endif
-		}
-
-		/* If ratio = 0 or is out of range of uint64, break: */
-		if(iop == 0) {
-		#if GCD_DEBUG>=2
-			if(gcd_debug) printf("EXIT 0\n");
-		#endif
-			break;	/* EXIT CALC_MULTS */
-		} else if(frac >= TWO63FLOAT) {	// Q: Should we require ratio to be in range of double-float mantissa instead?
-		#if GCD_DEBUG>=2
-			if(gcd_debug) printf("EXIT oo\n");
-		#endif
-			break;	/* EXIT CALC_MULTS */
-		}
-
-	/*	  Calculate diff[i] =  diff[i] + iop* diff(j), checking for integer overflow: */
-
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(iop,diff[j],&lo,&hi);
-	#else
-		MUL_LOHI64(iop,diff[j], lo, hi);
-	#endif
-		t3 = lo + diff[i];
-		/* If low part carries on add or high part>0, diff[i] + iop* diff(j) > 2^64 */
-		if((t3 < lo) || hi!=0) {
-		#if GCD_DEBUG>=2
-			if(gcd_debug) printf("EXIT A\n");
-		#endif
-			break;	/* EXIT CALC_MULTS */
-		}
-
-		diff[i] = t3;
-
-	/*
-	Similarly calculate coeff[i] = coeff[i] + iop*coeff(j). Since carries
-	in matrix-vector multiply routine are signed, must limit coefficients to be < 2^63.
-	For floating emulation of MULH64, limit coefficients to be < 2^52.
-	*/
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(iop,coeff[j][0],&lo,&hi);
-	#else
-		MUL_LOHI64(iop,coeff[j][0], lo, hi);
-	#endif
-		t3 = lo + coeff[i][0];
-		hi = hi + (t3<lo);
-		if(hi!=0 || (t3>>max_bits) != 0) {
-		#if GCD_DEBUG>=2
-			if(gcd_debug) printf("EXIT B\n");
-		#endif
-			break;	/* EXIT CALC_MULTS */
-		}
-
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(iop,coeff[j][1],&lo,&hi);
-	#else
-		MUL_LOHI64(iop,coeff[j][1], lo, hi);
-	#endif
-		t4 = lo + coeff[i][1];
-		hi = hi + (t4<lo);
-		if(hi!=0 || (t4>>max_bits) != 0) {
-		#if GCD_DEBUG>=2
-			if(gcd_debug) printf("EXIT C\n");
-		#endif
-			break;	/* EXIT CALC_MULTS */
-		}
-
-		coeff[i][0] = t3;
-		coeff[i][1] = t4;
-
-	/*	update row A of range:	*/
-
-		/* calculate iop*(range(j) + diff(j)). This is guaranteed not to exceed 128 bits. */
-
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(iop,t1,&t3,&t4);
-	#else
-		MUL_LOHI64(iop,t1, t3, t4);
-	#endif
-		t1 = t3;			/*  low part = MULL64(iop,lo)	*/
-		t2 = iop*t2 + t4;	/* high part = MULL64(iop,hi) + MULH64(iop,lo) */
-
-		/* range[i] = range[i] - iop*(range(j) + diff(j)). */
-
-		t1 = range[i][0] - t1;
-		range[i][1] = range[i][1] - t2 - (t1 > range[i][0]);	/* see if had a borrow. */
-		range[i][0] = t1;
-
-	/*	interchange row indices: */
-
-		k = i; i = j; j = k;
-
-#if GCD_DEBUG>=3
-if(gcd_debug) {
-	printf("\n");
-	printf("ABmul=%20llu   %20llu\n",coeff[0][0],coeff[0][1]);
-	printf("CDmul=%20llu   %20llu\n",coeff[1][0],coeff[1][1]);
-	printf("i,j = %d, %d\n",i,j);
-	printf("\n");
-}
-#endif
-	}	/* enddo CALC_MULTS */
-
-	ASSERT(HERE, i == (j^0x00000001),"gcd_lehmer.c: i == (j^0x00000001)");
-
-	max_abcd = 0;	/* Allows us to verify that our max. multiplier >= 1 - we allow equality
-					e.g. if our scalar-multiplier matrix has form
-
-							( a  -b )   ( 1   0 )
-							(       ) = (       )
-							( c  -d )   ( 1  -1 ) .
-
-					i.e. mainly ensure that we're not doing a trivial multiply by the identity matrix. */
-
-	abmul[k1] = coeff[i][0];	max_abcd = MAX(abmul[k1], max_abcd);	/* a is here */
-	abmul[k2] = coeff[i][1];	max_abcd = MAX(abmul[k2], max_abcd);	/* b is here */
-
-	cdmul[k1] = coeff[j][0];	max_abcd = MAX(cdmul[k1], max_abcd);	/* c is here */
-	cdmul[k2] = coeff[j][1];	max_abcd = MAX(cdmul[k2], max_abcd);	/* d is here */
-
-#if GCD_DEBUG >= 2
-if(gcd_debug) {
-	printf("a = %20llu; b = %20llu;\n", abmul[ 0], abmul[ 1]);
-	printf("c = %20llu; d = %20llu;\n", cdmul[ 0], cdmul[ 1]);
-}
-#endif
-
-	if(max_abcd <= 1) {
-		/* Make sure there are at least 3 nonzero matrix coefficients: */
-		if((abmul[k1]+abmul[k2] + cdmul[k1]+cdmul[k2]) < 3)
-		{
-			sprintf(string0, "2 or more zero matrix coefficients: %u, %u, %u, %u.\n", (uint32)abmul[k1], (uint32)abmul[k2], (uint32)cdmul[k1], (uint32)cdmul[k2]);
-			ASSERT(HERE, 0, string0);
-		}
-	}
-
-	/*	Now calculate the matrix-vector product
-
-	( u')	   ( a  -b ) ( u )	   ( |a*u - b*v| )
-	(   ) =	ABS(       )*(   )	== (             )
-	( v')	   ( c  -d ) ( v )	   ( |c*u - d*v| )
-	*/
-	tmp32 = matrix_vector_product_sub(abmul, cdmul, uv_ptr, lenU);
-
-	/* Bottom bit of return value indicates whether either result vector
-	had greater magnitude than the input vectors - should be 0 in this case: */
-	ASSERT(HERE, (tmp32&1) == 0, "gcd_lehmer: unexpected exit carry from matrix_vector_product_sub!");
-
-	/*
-	Store any sign-flip that may have occurred (e.g. the routine detected that
-	the full-length scalar*vector product a*u - b*v was actually < 0, and as a
-	result returned the inverted b*v - a*u in the u-vector slot) in j1 and j2:
-	*/
-	j1 = (tmp32>>(k1+1))&1;
-	j2 = (tmp32>>(k2+1))&1;
-
-#if GCD_DEBUG>=2
-/* Insert a nonzero pass number here to enable pass-specific debug: */
-if(gcd_debug && pass==0) {
-	printf("Result of matrix_vector_product:\n");
-	printf("                         U                   V  \n");
-	printf("u=v=0;\n");
-	for(i=0; i<lenU; i++)
-		printf("i = %u; u+=%20llu<<(i<<6); v+=%20llu<<(i<<6);\n",i,u[i],v[i]);	// For every i++, shift count += 64
-	printf("\n");
-}
-#endif
-
-	if(EGCD) {
-	#if 0/*GCD_DEBUG>=2*/
-		/* Compute the leading 128 bits of the
-		A' = (a*A + b*C), B' = (a*B + b*D), C' = (c*A + d*C), D' = (c*B + d*D)
-		products and the leading 256 bits of the resulting A'*u,B'*v,C'*u,D'*v terms.
-		Then repeat, but for the subtractive
-		A" = (a*A - b*C), B" = (a*B - b*D), C" = (c*A - d*C), D" = (c*B - d*D) terms.
-		This is to see if we want to + or - in updating the eGCD multipliers.
-		*/
-		if(gcd_debug) {
-			/* Leading 128 bits of A/B: */
-			if(mi64_cmpugt(A, B, len_abcd) {
-				tmp32 = mi64_getlen(A, len_abcd);
-				i = leadz64(A, tmp32);
-			} else {
-				tmp32 = mi64_getlen(B, len_abcd);
-				i = leadz64(B, tmp32);
-			}
-			mi64_extract_lead128(A, tmp32, i, a128);
-			mi64_extract_lead128(B, tmp32, i, b128);
-
-			/* Leading 128 bits of C/D: */
-			if(mi64_cmpugt(C, D, len_abcd) {
-				tmp32 = mi64_getlen(C, len_abcd);
-				i = leadz64(C, tmp32);
-			} else {
-				tmp32 = mi64_getlen(D, len_abcd);
-				i = leadz64(D, tmp32);
-			}
-			mi64_extract_lead128(C, tmp32, i, c128);
-			mi64_extract_lead128(D, tmp32, i, d128);
-
-			/* A' = (a*A + b*C)*/
-			mi64_mul_scalar(a128, ab_mul[0], x192, 2);
-			mi64_mul_scalar(c128, ab_mul[1], y192, 2);
-			mi64_add(x192,y192,x192);
-			ap128[0]=x192[1];	ap128[1]=x192[2];	/* extract leading 128 bits of sum */
-			/* B' = (a*B + b*D)*/
-			mi64_mul_scalar(b128, ab_mul[0], x192, 2);
-			mi64_mul_scalar(d128, ab_mul[1], y192, 2);
-			mi64_add(x192,y192,x192);
-			bp128[0]=x192[1];	bp128[1]=x192[2];	/* extract leading 128 bits of sum */
-			/* C' = (c*A + d*C)*/
-			mi64_mul_scalar(a128, cd_mul[0], x192, 2);
-			mi64_mul_scalar(c128, cd_mul[1], y192, 2);
-			mi64_add(x192,y192,x192);
-			cp128[0]=x192[1];	cp128[1]=x192[2];	/* extract leading 128 bits of sum */
-			/* D' = (c*B + d*D)*/
-			mi64_mul_scalar(b128, cd_mul[0], x192, 2);
-			mi64_mul_scalar(d128, cd_mul[1], y192, 2);
-			mi64_add(x192,y192,x192);
-			dp128[0]=x192[1];	dp128[1]=x192[2];	/* extract leading 128 bits of sum */
-
-			/* Now calculate & print the leading terms of |A'*u-B'*v|, |C'*u-D'*v|: */
-
-***************************************************************************
-/***No - this won't work, since we no longer have the original U and V!***/
-		}
-	#endif
-		/*
-		If doing an eGCD, update the 2 columns of the eGCD multiplier array similarly.
-		The basic computation is of the form
-
-			(  A' -B')   ( a  -b ) (  A  -B )   ( (a*A + b*C)  -(a*B + b*D) )
-			(        ) = (       )*(        ) = (                           )
-			( -C'  D')   ( c  -d ) ( -C   D )   ( (c*A + d*C)  -(c*B + d*D) )
-
-		The indices j1 and j2 tell us whether the elements of the (a, -b) and (c, -d)
-		pairs have been flipped (e.g. j1 = 1 means that the first row of our scalar-multipliers
-		matrix was ( -b  a ) rather than ( a  -b )), i.e. whether we need to flip the corresponding
-		sign bit of sign_abcd. Since the sign-flips indicated by j1 and j2 can be overridden post hoc
-		during the execution of matrix_vector_product_sub, we XOR the a priori sign-flip bits
-		(j1 +2*j2) with the post-hoc-override bits (stored in tmp32) to get the final flip-values:
-		*/
-		sign_abcd ^= (j1 + (j2 << 1));
-
-		/* The u/v-update call to matrix_vector_product_sub normally expects
-		a single sign flip in computing |a*u-b*v| and |c*u-d*v|, in which case
-		we call matrix_vector_product_add to update the eGCD multipliers. OTOH
-		if there were 0 or 2 sign flips in the u/v-update, call matrix_vector_product_sub
-		to update eGCD multipliers: */
-
-/*****HOW TO PROPERLY DEAL WITH CASE WHERE A*u-B*v or C*u-D*v = 0?******/
-
-		// No more need for this - found it easier to simply fix the needed signs at very end (cf. 'Nov 2015' note below):
-		ASSERT(HERE, egcd_sign_swap == 0, "egcd_sign_swap unexpectedly set!");
-		if(egcd_sign_swap) {
-			ASSERT(HERE, 0, "Bad code!");
-		#if GCD_DEBUG>=2
-			if(gcd_debug) {
-				printf("Pass = %u\n", pass);
-				printf("A/B eGCD multiplier vectors:\n");
-				if(sign_abcd & 0x1)
-					printf("                         -A                  +B  \n");
-				else
-					printf("                         +A                  -B  \n");
-
-				printf("a=b=0;\n");
-				for(i=0; i<len_abcd; i++)
-					printf("i = %u; a+=%20llu<<(i<<6); b+=%20llu<<(i<<6);\n",i,A[i],B[i]);
-				printf("\n");
-
-				printf("C/D eGCD multiplier vectors:\n");
-				if(sign_abcd & 0x2)
-					printf("                         -C                  +D  \n");
-				else
-					printf("                         +C                  -D  \n");
-
-				printf("c=d=0;\n");
-				for(i=0; i<len_abcd; i++)
-					printf("i = %u; c+=%20llu<<(i<<6); d+=%20llu<<(i<<6);\n",i,C[i],D[i]);
-				printf("\n");
-			}
-		#endif
-
-			/* Don't combine the 2 calls into an if(call1() || call2()),
-			since we need to be sure both function calls always occur: */
-			tmp32  = matrix_vector_product_sub(abmul, cdmul, ac_ptr, len_abcd);
-			tmp32 |= matrix_vector_product_sub(abmul, cdmul, bd_ptr, len_abcd);
-			len_abcd += (tmp32&1);
-		} else {
-			tmp32  = matrix_vector_product_add(abmul, cdmul, ac_ptr, len_abcd);
-			tmp32 |= matrix_vector_product_add(abmul, cdmul, bd_ptr, len_abcd);
-			len_abcd += (tmp32&1);
-		}
-
-		// Early-abort in case using eGCD to accumulate "chunks of yardage" for subsequent FFT-mul reduction:
-		if(HALF && len_abcd >= len_targ) goto HALF_EXIT;
-
-		/* Lookahead scheme? */
-		if(j1 ^ j2)
-			egcd_sign_swap = FALSE;
-		else
-			egcd_sign_swap = TRUE;
-
-	#if GCD_DEBUG>=2
-		if(gcd_debug) {
-			printf("A/B eGCD multiplier vectors, sign_abcd = %u:\n",sign_abcd);
-			printf("a=b=0;\n");
-			for(i=0; i<len_abcd; i++)
-				printf("i = %u; a+=%20llu<<(i<<6); b+=%20llu<<(i<<6);\n",i,A[i],B[i]);
-			printf("\n");
-
-			printf("C/D eGCD multiplier vectors:\n");
-			printf("c=d=0;\n");
-			for(i=0; i<len_abcd; i++)
-				printf("i = %u; c+=%20llu<<(i<<6); d+=%20llu<<(i<<6);\n",i,C[i],D[i]);
-			printf("\n");
-		}
-	#endif
-	}
-
-/**** 8/23/2005: disable for now, since this really complicates things when doing eGCD. ****/
-#if 0
-	uint32 shift_ab = 0, shift_cd = 0;
-	/* Right-justify U and V, one of which (but not both)
-	might be even as a result of the matrix_vector_product call.
-	Must do this to prevent accumulation of spurious powers of 2
-	in the gcd, i.e. if we see at the start of the next pass through
-	the main for-loop that V needs left-justifying, we need to ensure
-	that U is odd, otherwise the gcd will accumulate a spurious 2:
-	*/
-	/* lenU,B remain unchanged: */
-	tz0 = mi64_trailz(u, lenU);
-	if(tz0) {
-		mi64_shrl(u, u, tz0, lenU);
-shift_cd += tz0;
-		if(EGCD) {
-			/* Left-shift the complement-vector eGCD multipliers (C/D) by tz0 places: */
-			mi64_shl(ac_ptr[1], ac_ptr[1], tz0, len_abcd);
-			mi64_shl(bd_ptr[1], bd_ptr[1], tz0, len_abcd);
-			len_abcd = MAX(mi64_getlen(ac_ptr[1], len_abcd) , mi64_getlen(bd_ptr[1], len_abcd));
-		}
-	}
-
-	tz1 = mi64_trailz(v, lenU);
-	if(tz1) {
-		mi64_shrl(v, v, tz1, lenU);
-shift_ab += tz1;
-		if(EGCD) {
-			/* Left-shift the complement-vector eGCD multipliers by (A/B) tz1 places: */
-			mi64_shl(ac_ptr[0], ac_ptr[0], tz1, len_abcd);
-			mi64_shl(bd_ptr[0], bd_ptr[0], tz1, len_abcd);
-			len_abcd = MAX(mi64_getlen(ac_ptr[0], len_abcd) , mi64_getlen(bd_ptr[0], len_abcd));
-		}
-	}
-#endif
-
-	/*	set k1 to row index of larger element: */
-	istart = lenU-1;
-	lenU = 0;	/* This allows us to determine whether lenU has already been set on a previous pass in the ensuing loop. */
-	for(i=istart; i>=0; i--) {
-		if((u[i] != 0 || v[i] != 0) && lenU == 0)
-			lenU = i+1;
-		/* Even if we've already found the leading nonzero element,
-		in order to properly set k1 and k2 continue until find a differing element:
-		*/
-		if(u[i] == v[i])	//<*** On pass 2, get upper 2 slots = 0 as expected, but next 350 or so all have u[i] = v[i] = 14757395258967641292 ***
-			continue;		//That leads to u/v having leading 350 64-bit words identical, i.e. on next cal-mults pass expect a linear combo = u-v
-		else {				//in on of the slots, other should be u or v unchanged, i.e. scalar-mults matrix = 1  -1 / 1   0 .
-			k1 = (u[i] < v[i]);
-			k2 = k1 ^ 0x00000001;
-			break;
-		}
-	}
-	/*	If U = V, we're done, with a copy of a nontrivial GCD in each of the 2 vectors. */
-	if(i == -1) goto GCD_DONE;
-
-	/* Otherwise, parse the smaller array until find leading nonzero element: */
-	x = uv_ptr[k2];
-	for(i=lenU-1; i>=0; i--) {
-		if(x[i] != 0) break;
-	}
-	lenV = i + 1;
-	/* If lenV = 0, we're done. */
-	if(lenV == 0) goto GCD_DONE;
-
-#if GCD_DEBUG>=2
-if(gcd_debug || (pass&1023)==0) printf("PASS %d: lenU = %u, lenV = %u, k1,k2 = %d,%d\n",pass,lenU,lenV,k1,k2);
-#endif
-}	/* enddo MAIN	*/
-
-GCD_DONE:
-
-	/*	Print GCD: */
-#if GCD_DEBUG>=0
-	printf("GCD finished in %u passes.\n", pass);
-#endif
-	ASSERT(HERE, (lenU+lenV != 0), "mi_gcd: final U & V-vectors both zero-length!");
-
-	/* Put copies of the GCD into both uv_ptr[k1] and uv_ptr[k2]: */
-	for(i = 0; i < lenU; i++) {
-		uv_ptr[k2][i] = uv_ptr[k1][i];
-	}
-
-	/* Nov 2015:
-	Just compute the EGCD multiplies A,B,C,D sans signs through course of GCD, then at end simply
-	do a 64-bit low-half multiply of 0-words of A*u and B*v, whichever of these 2 64-bit products
-	is 1 greater than the other gets the + sign:
-	*/
-	if(EGCD) {
-		uv0[0] *= A[0];	uv0[1] *= B[0];
-		sign_abcd = (uv0[1] > uv0[0]);	// Result = index of whichever of A/B gets the + sign in GCD = +-A*u-+B*v :
-		ASSERT(HERE, uv0[sign_abcd] - uv0[sign_abcd ^ 1] == 1ull, "EGCD multiplier sanity check failed!");
-	}
-
-HALF_EXIT:
-
-	if(EGCD) {
-		mi64_set_eq(Ap,A,len_abcd); mi64_set_eq(Bp,B,len_abcd); *len_AB = len_abcd; *sign_AB = sign_abcd & 0x1;
-	  if(HALF) {
-		mi64_set_eq(Cp,C,len_abcd); mi64_set_eq(Dp,D,len_abcd); *len_CD = len_abcd; *sign_CD = sign_abcd >> 1;
-		ASSERT(HERE, len_abcd == len_targ, "HALF_EXIT: len_abcd != len_targ");
-		// The sign_abcd-setting in the code above doesn't work in the half-gcd case, so instead
-		// use signs of LSWs of inputs and outputs to deduce signs of a*u-b*v, c*u-d*v:
-		lo = Ap[0]*uv0[0] - Bp[0]*uv0[1]; if(lo == u[0]) { *sign_AB = 0; } else if(lo == -u[0]) { *sign_AB = 1; } else { ASSERT(HERE, 0, "foo!"); }
-		lo = Cp[0]*uv0[0] - Dp[0]*uv0[1]; if(lo == v[0]) { *sign_CD = 0; } else if(lo == -v[0]) { *sign_CD = 1; } else { ASSERT(HERE, 0, "bar!"); }
-		// If HALF = true, return value is the length of the largest of the ABCD-multiplier vectors.
-		return len_abcd;
-	  }
-	}
-
-#if GCD_DEBUG>=1
-	if(EGCD && gcd_debug) {
-		ASSERT(HERE, min_trailing_zeros == 0, "min_trailing_zeros != 0");
-
-		printf("A/B eGCD multiplier vectors:\n");
-		if(sign_abcd & 0x1)
-			printf("                         -A                  +B  \n");
-		else
-			printf("                         +A                  -B  \n");
-
-		printf("a=b=0;\n");
-		for(i=0; i<len_abcd; i++)
-			printf("i = %u; a+=%20llu<<(i<<6); b+=%20llu<<(i<<6);\n",i,A[i],B[i]);
-		printf("\n");
-
-		printf("C/D eGCD multiplier vectors:\n");
-		if(sign_abcd & 0x2)
-			printf("                         -C                  +D  \n");
-		else
-			printf("                         +C                  -D  \n");
-
-		printf("c=d=0;\n");
-		for(i=0; i<len_abcd; i++)
-			printf("i = %u; c+=%20llu<<(i<<6); d+=%20llu<<(i<<6);\n",i,C[i],D[i]);
-		printf("\n");
-
-		printf("GCD vectors are:\n");
-		printf("                         U                   V  \n");
-		printf("u=v=0;\n");
-		for(i=0; i<lenU; i++)
-			printf("i = %u; u+=%20llu<<(i<<6); v+=%20llu<<(i<<6);\n",i,u[i],v[i]);	// For every i++, shift count += 64
-		printf("\n");
-	}
-#endif
-
-	// Restore any common power of 2: *** Still need to generalize this to EGCD case ***
-	if(!EGCD && !HALF && min_trailing_zeros > 0) {
-		lenU += (min_trailing_zeros>>6)+1;
-		mi64_shl(uv_ptr[k1], uv_ptr[k1], min_trailing_zeros, lenU);
-		lenU = mi64_getlen(uv_ptr[k1], lenU);	/* 12/16/08: getlen() had lenV here. (?) */
-	}
-
-	// Skip GCD-printing if sending leading chunks of large vectors to eGCD in order
-	// to generate multiword abcd-multipliers for FFT-mul-speeded length reduction:
-	if(!HALF) {
-		/* Currently printing functions can only handle up to STR_MAX_LEN digits: */
-		if(lenU*log10_base >= STR_MAX_LEN) {
-			printf("GCD exceeds printable string length - printing in raw base-2^64 form, least-significant first:\n");
-			printf("GCD = 2^%u * [\n", min_trailing_zeros);
-			for(i = 0; i < lenU; i++)
-				printf("i = %u:  %20llu\n",i,uv_ptr[k1][i]);
-			printf("]\n");
-		} else {
-			printf("GCD = %s\n", &string0[convert_mi64_base10_char(string0, uv_ptr[k1], lenU, 0)]);
-		}
-	}
-	// Return value = number of 64-bit words needed to hold the GCD:
-	return lenU;
-}
-
-/************* Helper functions: *************/
-
-/*	Calculates the matrix-vector product
-
-		( u' )      ( a  -b ) ( u )
-		(    ) = ABS(       )*(   )
-		( v' )      ( c  -d ) ( v ) ,
-
-	where the absolute-value applies to the separate linear combinations resulting
-	from the matrix-vector multiply.
-
-	Here, a, b, c and d are unsigned 64-bit integers having magnitude <= 2^63 and
-	u and v are pointers to base-2^64 multiword integer arrays assumed to contain at most
-	(len) nonzero elements. The routine examines the inputs and decides how to order
-	the a*u/b*v and c*u/d*v scalr*vector multiplies so as to make the result of each
-	nonnegative - since the criterion it uses to do so examines only the leading 192
-	bits of each of the 4 aforementioned products it is not completely deterministic,
-	and there is postprocessing code to fix things up if the 192-bit ordering-guess
-	proves incorrect. (But in general we hope the 192-bit guess is correct most of the time.)
-
-	For each row-times-vector inner product defined by the above matrix-vector multiply, the
-	u'/v'-pointers point (in terms of the main fixed-address u/v data arrays defined in the
-	calling function) either to (u, v) or (v, u), as determined by the arguments ii1 for the
-	(a, -b) row and ii2 for the (c, -d) row multiplies, respectively. This mechanism is designed
-	so that via the above-described small amount of preprocessing, ii1 and ii2 can be set to
-	greatly enhance the odds) that each of the resulting row-times-vector inner products
-	will be nonnegative. In terms of the fixed-address u/v data arrays, the 4 possible values
-	of the (ii1, ii2) bit pair define 4 possibilities, as follows:
-
-							(  a  -b ) ( u )
-	(ii1, ii2) = (0, 0):	(        )*(   )
-							(  c  -d ) ( v )
-
-							(  a  -b ) ( u )
-	(ii1, ii2) = (0, 1):	(        )*(   )
-							( -d   c ) ( v )
-
-							( -b   a ) ( u )
-	(ii1, ii2) = (1, 0):	(        )*(   )
-							(  c  -d ) ( v )
-
-							( -b   a ) ( u )
-	(ii1, ii2) = (1, 1):	(        )*(   )
-							( -d   c ) ( v ) .
-
-	In our implementation below we define indices j1 = ii1, j2 = 1 - j1, j3 = ii2, j4 = 1 - j3
-	and access u and v via uv_ptr[0] = &u and uv_ptr[1] = &v, in terms of which our 2 output
-	vectors (which overwrite the inputs) are:
-
-		u" = abmul[j1]*( uv_ptr[j1] ) - abmul[j2]*( uv_ptr[j2] )
-
-		v" = cdmul[j3]*( uv_ptr[j3] ) - cdmul[j4]*( uv_ptr[j4] )
-
-	(the aforementioned calling-function preprocessing comes into play here, in that it is
-	desirable that the user has arranged the scalar multipliers a,b,c,d such that u" and v" >= 0.)
-
-	EFFECTS:
-			- Overwrites the input vectors with the result vectors.
-
-	RETURNS: the following result-value bits are meaningful:
-
-			<0:0> = 1 if either of the |a*u-b*v| or |c*u-d*v| linear combination
-			  had a nonzero output carry (which is placed in the respective u,v[len]
-			  vector slot), which should only occur when the routine is being called
-			  to update eGCD multipliers - in this case caller should increment (len).
-
-			  In the case where the routine is called to update GCD vectors (i.e.
-			  the outputs have size less than the inputs, leaves any needed zero-leading-
-			  element detection and subsequent vector-length-decrementing to the caller.
-
-			<1:2> = 1 if the respective a*u-b*v or c*u-d*v linear combination
-			  needed a negation in order to make the result nonnegative.
-*/
-uint32 matrix_vector_product_sub
-(
-	uint64c abmul[],	/* It is assumed here that abmul[0,1] multiply uv_ptr[0,1]! */
-	uint64c cdmul[],	/* It is assumed here that cdmul[0,1] multiply uv_ptr[0,1]! */
-	uint64 *uv_ptr[],
-	uint32  len
-)
-{
-	uint32 i, j1,j2,j3,j4, nshift, retval = 0;
-	sint64 cy1 = 0, cy2 = 0;	/* Carries */
-	uint64 cylo, hi1, hi2, lo1, lo2, lo;
-	int cyhi;
-	sint64 a, b, c, d;
-	uint64 *u, *v, *A, *B, *C, *D;
-	uint64 lead_u[2], lead_v[2];
-	uint64 val1, val2, val3, val4;
-
-	/* Using absolute indices here (rather than k1/k2) during u/v-update calls
-	eases later eGCD bookkeeping; */
-	u = uv_ptr[0];
-	v = uv_ptr[1];
-
-	/* Find larger of the 2 vectors in order to properly set shift
-	count prior to leading-bits extraction: */
-	if(mi64_cmpult(v, u, len))
-		nshift = leadz64(u[len-1]);	/* If U > V, use it to set shift count... */
-	else
-		nshift = leadz64(v[len-1]);	/* ...otherwise use V to set shift count. */
-
-	/*	Figure out which order to do the multiplies in (i.e. a*u - b*v or b*v - a*u)
-	to ensure (or at least maximize the chance, as O(len)-costly postprocessing is
-	needed otherwise) that result >= 0:
-	*/
-
-	/*	If leading 192 bits of a*u < b*v, swap a,b and the ptrs to u,v: */
-	mi64_extract_lead128(u,len, nshift, lead_u);	//	ASSERT(HERE, (lead_u[0]+lead_u[1]) != 0, "(lead_u[0]+lead_u[1]) == 0!");
-	mi64_extract_lead128(v,len, nshift, lead_v);	//	ASSERT(HERE, (lead_v[0]+lead_v[1]) != 0, "(lead_v[0]+lead_v[1]) == 0!");
-
-	j1 = 0;
-	if(CMP_LT_PROD192(abmul[0], lead_u[0], lead_u[1], abmul[1], lead_v[0], lead_v[1]))
-	{
-	#if GCD_DEBUG>=2
-		if(gcd_debug) printf("(a*u) < (b*v) : swapping these\n");
-	#endif
-		j1 ^= 0x00000001;
-		retval ^= 0x00000002;
-	}
-	j2 = j1^0x00000001;
-
-/*	Now do same for c and d: */
-
-	j3 = 0;
-	if(CMP_LT_PROD192(cdmul[0], lead_u[0], lead_u[1], cdmul[1], lead_v[0], lead_v[1]))
-	{
-	#if GCD_DEBUG>=2
-		if(gcd_debug) printf("(c*u) < (d*v) : swapping these\n");
-	#endif
-		j3 ^= 0x00000001;
-		retval ^= 0x00000004;
-	}
-	j4 = j3^0x00000001;
-
-	/*
-	Proceed with the pairwise-linear-combination step, writing results back into u/v:
-	*/
-	/* Check magnitudes of a,b,c,d < 2^63: */
-	a = abmul[j1];	b = abmul[j2];	c = cdmul[j3];	d = cdmul[j4];
-	ASSERT(HERE, a >= 0 && b >= 0 && c >= 0 && d >= 0,"matrix_vector_product_sub: a >= 0 && b >= 0 && c >= 0 && d >= 0");
-
-	A = uv_ptr[j1];	B = uv_ptr[j2];	C = uv_ptr[j3];	D = uv_ptr[j4];
-
-#ifndef YES_ASM	// Std-C version of the critical loop:
-
-	// 04 May 2012 x86_64 timing experiments: 2x loop-unroll was worse, rejiggering MULs as noted below helped:
-	for (i = 0; i < len; i++)
-	{
-		/* Make copies of the current 4 vector elts since some of these will get clobbered: */
-		val1 = A[i];
-		val2 = B[i];
-	#ifdef MUL_LOHI64_SUBROUTINE	// Moving the first pair of MULs from just below the val4 = to here gave a 7% speedup
-		MUL_LOHI64(a, val1,&lo1,&hi1);
-		MUL_LOHI64(b, val2,&lo2,&hi2);
-	#else
-		MUL_LOHI64(a, val1, lo1, hi1);
-		MUL_LOHI64(b, val2, lo2, hi2);
-	#endif
-		// These 2 are possibly-swapped copies to same data as val1,2, must init before overwriting u,v!
-		val3 = C[i];
-		val4 = D[i];
-
-	/* a*u[i] - b*v[i] */
-		/*
-		Compute cy1 + (hi1, lo1) - (hi2, lo2)
-			= (cyhi, cylo) - [(hi2, lo2) - (hi1, lo1)] .
-		Note cy1 is signed.
-		*/
-		cylo = (uint64)cy1;
-		cyhi = -(cy1 < 0);	/* Sign extension of cy1 */
-
-		lo = lo2 - lo1;
-		cyhi = cyhi - (cylo < lo) + (lo2 < lo1);
-		cy1 = (sint64)cyhi + (sint64)(hi1 - hi2);
-
-		u[i] = cylo - lo;
-
-/*	printf("difference = %lld + %lld *2^64\n", u[i], cy1);	*/
-
-	/* c*u[i] - d*v[i] */
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(c, val3,&lo1,&hi1);
-		MUL_LOHI64(d, val4,&lo2,&hi2);
-	#else
-		MUL_LOHI64(c, val3, lo1, hi1);
-		MUL_LOHI64(d, val4, lo2, hi2);
-	#endif
-		/*
-		Compute cy2 in same fashion as cy1 above:
-		*/
-		cylo = (uint64)cy2;
-		cyhi = -(cy2 < 0);	/* Sign extension of cy2 */
-
-		lo = lo2 - lo1;
-		cyhi = cyhi - (cylo < lo) + (lo2 < lo1);
-		cy2 = (sint64)cyhi + (sint64)(hi1 - hi2);
-		v[i] = cylo - lo;
-
-/*	printf("difference = %lld + %lld *2^64\n", v[i], cy2);	*/
-	} /* for i */
-
-#else	// YES_ASM = true: x86_64 inline-asm version of the critical loop:
-		// We get rid of most refs to var-signedness here, since at the machine-instruction level
-		// that comes about via choice of instructions (e.g. arithmetic vs logical right-shift),
-		// and only matters for a very few instructions.
-
-  #if 1
-	// Oct 2015: This inline-ASM version of the loop runs 25% faster than the C version on my Core2;
-	// The main gain came not from the loop-body (which I ASM-ized) first, but from the finishing
-	// touch of full-ASM of the loop control.
-	__asm__ volatile (\
-		/* 2 of our 4 vector-int addresses can be load-once, as these are not overwritten in the loop: */\
-		"movq	%[__A],%%r12	\n\t	movq	%[__B],%%r13	\n\t"\
-
-	"movslq	%[__len], %%rcx	\n\t"/* ASM loop structured as for(j = len; j != 0; --j){...} */\
-	"xorq	%%rsi, %%rsi	\n\t"/* Index into the 3 arrays (really a uint64-array pointer offset), init = 0 */\
-	"movq	%%rcx,%%rdi	\n\t"/* Copy of array-length in rdi */\
-	"1:					\n\t"\
-
-		/* Remaining 2 of our 4 vector-int addresses must be reloaded each loop pass: */\
-		"movq	%[__C],%%r14	\n\t	movq	%[__D],%%r15	\n\t"\
-		"movq	%[__cy1],%%r8 	\n\t	movq	%[__cy2],%%r9 	\n\t"/* Load cy1,2 */\
-		"movq	(%%r12,%%rsi),%%rax 	\n\t"/* val1 = A[i] */\
-		"mulq	%[__a]					\n\t"/* MUL_LOHI64(a, val1); lo1:hi1 in rax:rdx; CF (unused) set according to (hi == 0) */\
-		"movq	%%rax,%%rbx				\n\t"/* lo1 */\
-		"movq	%%rdx,%%rdi				\n\t"/* hi1 */\
-		"movq	(%%r13,%%rsi),%%rax 	\n\t"/* val2 = B[i] */\
-		"mulq	%[__b]					\n\t"/* MUL_LOHI64(b, val2); lo2:hi2 in rax:rdx */\
-
-		/* Load val3,4 before overwriting u,v[i] below */\
-		"movq	(%%r14,%%rsi),%%r14 	\n\t"/* val3 = C[i] */\
-		"movq	(%%r15,%%rsi),%%r15 	\n\t"/* val4 = D[i] */\
-
-		"movq	%%r8 ,%%r10				\n\t"/* cylo = cy1, cylo in r10 */\
-		"sarq	$63,%%r8 				\n\t"/* cy1 = -(cy1 < 0); Sign extension of cy1 */\
-		"subq	%%rbx,%%rax				\n\t"/* lo[rax] = lo2 - lo1 */\
-		"adcq	$0 ,%%r8 				\n\t"/* cy1 += Borrow out of (lo2 - lo1) */\
-		"subq	%%rax,%%r10				\n\t"/*        cylo - lo */\
-		"sbbq	$0 ,%%r8 				\n\t"/* cy1 -= Borrow out of (cylo - lo) */\
-		"movq	%[__u],%%r11			\n\t"\
-		"movq	%%r10,(%%r11,%%rsi)	\n\t"/* u[i] = cylo - lo */\
-		"addq	%%rdi,%%r8 				\n\t"/* cy1 += hi1 */\
-		"subq	%%rdx,%%r8 				\n\t"/* cy1 -= hi2 */\
-		"movq	%%r8 ,%[__cy1]		\n\t"/* Store cy1 */\
-
-		"movq	%%r14,%%rax 	\n\t"/* val3 = C[i] */\
-		"mulq	%[__c]					\n\t"/* MUL_LOHI64(c, val1); lo1:hi1 in rax:rdx */\
-		"movq	%%rax,%%rbx				\n\t"/* lo1 */\
-		"movq	%%rdx,%%rdi				\n\t"/* hi1 */\
-		"movq	%%r15,%%rax 	\n\t"/* val4 = D[i] */\
-		"mulq	%[__d]					\n\t"/* MUL_LOHI64(d, val2); lo2:hi2 in rax:rdx */\
-		"movq	%%r9 ,%%r10				\n\t"/* cylo = cy2, cylo in r10 */\
-		"sarq	$63,%%r9 				\n\t"/* cy2 = -(cy2 < 0); Sign extension of cy2 */\
-		"subq	%%rbx,%%rax				\n\t"/* lo[rax] = lo2 - lo1 */\
-		"adcq	$0 ,%%r9 				\n\t"/* cy2 += Borrow out of (lo2 - lo1) */\
-		"subq	%%rax,%%r10				\n\t"/*        cylo - lo */\
-		"sbbq	$0 ,%%r9 				\n\t"/* cy2 -= Borrow out of (cylo - lo) */\
-		"movq	%[__v],%%r11			\n\t"\
-		"movq	%%r10,(%%r11,%%rsi)	\n\t"/* v[i] = cylo - lo */\
-		"addq	%%rdi,%%r9 				\n\t"/* cy2 += hi1 */\
-		"subq	%%rdx,%%r9 				\n\t"/* cy2 -= hi2 */\
-		"movq	%%r9 ,%[__cy2]		\n\t"/* Store cy2 */\
-
-	"leaq	0x8(%%rsi), %%rsi	\n\t"/* Incr shared mem-offset */\
-	"decq	%%rcx \n\t"\
-	"jnz 1b 	\n\t"/* loop1 end; continue is via jump-back if rcx != 0 */\
-		: /* outputs: none */\
-		: /* All inputs from memory here (Trying to use mem/reg "g" gives 'invalid operand for instruction' errors) */\
-		  [__u] "m" (u), [__v] "m" (v)	\
-		 ,[__a] "m" (a), [__b] "m" (b), [__c] "m" (c), [__d] "m" (d)	\
-		 ,[__A] "m" (A), [__B] "m" (B), [__C] "m" (C), [__D] "m" (D)	\
-		 ,[__cy1] "m" (cy1), [__cy2] "m" (cy2)	\
-		 ,[__len] "m" (len), [__i] "m" (i)	\
-		: "cc","memory","rax","rbx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","r15"	/* Clobbered registers */\
-	);
-
-  #else
-
-	for (i = 0; i < len; i++)
-	{
-		val1 = A[i];
-		val2 = B[i];
-		MUL_LOHI64(a, val1, lo1, hi1);
-		MUL_LOHI64(b, val2, lo2, hi2);
-		// These 2 are possibly-swapped copies to same data as val1,2, must init before overwriting u,v!
-		val3 = C[i];
-		val4 = D[i];
-
-	/* a*u[i] - b*v[i] */
-		cylo = cy1;
-		cy1  = -(cy1 < 0);	/* Sign extension of cy1 */
-		lo   = lo2 - lo1;
-		cy1 += (lo2 < lo1);	// cy1 += Borrow out of (lo2 - lo1)
-		u[i] = cylo - lo;
-		cy1 -= (cylo < lo);	// cy1 -= Borrow out of (cylo - lo)
-		cy1 += hi1;
-		cy1 -= hi2;
-
-	/* c*u[i] - d*v[i] */
-		MUL_LOHI64(c, val3, lo1, hi1);
-		MUL_LOHI64(d, val4, lo2, hi2);
-		cylo = cy2;
-		cy2  = -(cy2 < 0);	/* Sign extension of cy2 */
-		lo   = lo2 - lo1;
-		cy2 += (lo2 < lo1);	// Borrow out of (lo2 - lo1)?
-		v[i] = cylo - lo;
-		cy2 -= (cylo < lo);
-		cy2 += hi1;
-		cy2 -= hi2;
-	} /* for i */
-
-  #endif
-
-#endif	// YES_ASM ?
-
-#if GCD_DEBUG>=2
-//	printf("cy1 = %lld, cy2 = %lld, cy1|cy2 = %lld\n", cy1, cy2, cy1|cy2);
-#endif
-
-	/*	For GCD main-vector updates we know exit carries are 0 or -1,
-	but to handle eGCD output carries (which may be as large as the difference of
-	two 63-bit ints) need to generalize this to detect any negative 64-bit int: */
-	if(cy1 < 0) {
-	#if GCD_DEBUG
-		printf("cy1 < 0...complementing.\n");
-	#endif
-		for (i = 0; i < len; i++)
-			u[i] = ~u[i];	/* Ones' complement */
-		cy1 = ~cy1;
-	    /* Add -cy1, exit as soon as result is nonzero, indicating that the carryout of the incrementing is 0: */
-		for (i = 0; i < len; i++) {
-			u[i]++;
-			if (u[i] != 0) break;
-		}
-		/* Post-processing negation flips any corresponding sign bit in the return field: */
-		retval ^= 0x00000002;
-	}
-
-	if(cy2 < 0) {
-	#if GCD_DEBUG
-		printf("cy2 < 0...complementing.\n");
-	#endif
-		for (i = 0; i < len; i++)
-			v[i] = ~v[i];
-		cy2 = ~cy2;
-	    /* Add -cy2, exit as soon as result is nonzero, indicating that the carryout of the incrementing is 0: */
-		for (i = 0; i < len; i++) {
-			v[i]++;
-			if (v[i] != 0) break;
-		}
-		retval ^= 0x00000004;
-	}
-
-	/* Does vector length need incrementing? */
-	if(cy1 || cy2) {
-		u[len] = cy1;
-		v[len] = cy2;
-		retval |= 1;
-	}
-
-	return retval;
-
-} /* matrix_vector_product_sub */
-
-/*	Calculates the matrix-vector product
-
-		( u' )   ( a  b ) ( u )
-		(    ) = (      )*(   )
-		( v' )   ( c  d ) ( v ) .
-
-	Here, a, b, c and d are unsigned 64-bit integers having magnitude <= 2^63 and
-	u and v are pointers to base-2^64 multiword integer arrays assumed to contain at most
-	(len) nonzero elements.
-
-	EFFECTS:
-			- Overwrites the input vectors with the result vectors.
-
-	RETURNS:
-			- 1 if either of the linear combinations had a nonzero output carry,
-			  indicating to the caller that the vector length field needs incrementing
-*/
-uint32 matrix_vector_product_add
-(
-	uint64c abmul[],
-	uint64c cdmul[],
-	uint64 *uv_ptr[],
-	uint32  len
-)
-{
-	uint32 i;
-	uint64 cyloA = 0, cyhiA = 0, cyloC = 0, cyhiC = 0;
-	uint64 hi1, hi2, lo1, lo2;
-	sint64c a = abmul[0], b = abmul[1], c = cdmul[0], d = cdmul[1];
-	uint64 Ai, Ci;
-	uint64 *u = uv_ptr[0], *v = uv_ptr[1];
-
-	/* Check magnitudes of a,b,c,d < 2^63: */
-	ASSERT(HERE, a >= 0 && b >= 0 && c >= 0 && d >= 0,"matrix_vector_product_add: a >= 0 && b >= 0 && c >= 0 && d >= 0");
-
-	for (i = 0; i < len; i++)
-	{
-		Ai = u[i];
-		Ci = v[i];
-
-	/* a*u[i] + b*v[i] */
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(a, Ai,&lo1,&hi1);
-		MUL_LOHI64(b, Ci,&lo2,&hi2);
-	#else
-		MUL_LOHI64(a, Ai, lo1, hi1);
-		MUL_LOHI64(b, Ci, lo2, hi2);
-	#endif
-		/*
-			(u[i], cylo, cyhi) <-- (lo1 + lo2 + cylo, hi1 + hi2 + cyhi, 0)	(3-word vector add-with carry)
-		*/
-		/* Low part, with carryin from previous loop execution: */
-		lo1 += cyloA;	cyhiA += lo1 < cyloA;
-		lo1 +=   lo2;	cyhiA += lo1 < lo2;
-		u[i] = lo1;
-
-		/* High part, with carrying from low part: */
-		cyloA  = hi1 + hi2;		/* hi1, hi2 < 2^63, so this can't overflow */
-		cyloA += cyhiA;	cyhiA = cyloA < cyhiA;	/* Carries into the next loop execution */
-
-	/* c*u[i] + d*v[i] */
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(c, Ai,&lo1,&hi1);
-		MUL_LOHI64(d, Ci,&lo2,&hi2);
-	#else
-		MUL_LOHI64(c, Ai, lo1, hi1);
-		MUL_LOHI64(d, Ci, lo2, hi2);
-	#endif
-		/* Low part, with carryin from previous loop execution: */
-		lo1 += cyloC;	cyhiC += lo1 < cyloC;
-		lo1 +=   lo2;	cyhiC += lo1 < lo2;
-		v[i] = lo1;
-
-		/* High part, with carrying from low part: */
-		cyloC  = hi1 + hi2;
-		cyloC += cyhiC;	cyhiC = cyloC < cyhiC;	/* Carries into the next loop execution */
-
-	} /* for i */
-
-	/*	Make sure exit carries are at most one word long: */
-	ASSERT(HERE, (cyhiA+cyhiC == 0),"Carryouts from matrix_vector_product_add(abmul, cdmul, uv_ptr...) out of range!");
-	if(cyloA+cyloC != 0)
-	{
-		u[len] = cyloA;
-		v[len] = cyloC;
-		return 1;
-	}
-	return 0;
-
-} /* matrix_vector_product_add */
-
-
-/*******************/
-
-/*
-!   Unsigned compare of two 192-bit products: a*(xhi*2^64 + xlo) < b*(yhi*2^64 + ylo).
-!   where all operands are treated as unsigned 64-bit integers.
-!   If true, returns 1; otherwise returns zero.
-*/
-int	CMP_LT_PROD192(uint64 a, uint64 xlo, uint64 xhi, uint64 b, uint64 ylo, uint64 yhi)
-{
-	uint192 ax, by;
-	uint64 tmp;
-
-/*	calculate the two products. */
-/*
-!   a*x = a*(xhi*2^64 + xlo) = MULT_HIGH(a*xhi)*2^128 + [ a*xhi + MULT_HIGH(a*xlo) ]*2^64 + a*xlo
-!                            = ax.d2*2^128 + ax.d1*2^64 + ax.d0 is here...
-*/
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(a,xlo,&ax.d0,&  tmp);
-		MUL_LOHI64(a,xhi,&ax.d1,&ax.d2);
-	#else
-		MUL_LOHI64(a,xlo, ax.d0,   tmp);
-		MUL_LOHI64(a,xhi, ax.d1, ax.d2);
-	#endif
-	ax.d1 = ax.d1 + tmp;				/* middle part is here... */
-	ax.d2 = ax.d2 + (ax.d1<tmp);	/* had a carry into high part. */
-/*
-!   b*y = b*(yhi*2^64 + ylo) = MULT_HIGH(b*yhi)*2^128 + [ b*yhi + MULT_HIGH(b*ylo) ]*2^64 + b*ylo
-!                            = bx.d2*2^128 + bx.d1*2^64 + bx.d0 is here...
-*/
-	#ifdef MUL_LOHI64_SUBROUTINE
-		MUL_LOHI64(b,ylo,&by.d0,&  tmp);
-		MUL_LOHI64(b,yhi,&by.d1,&by.d2);
-	#else
-		MUL_LOHI64(b,ylo, by.d0,   tmp);
-		MUL_LOHI64(b,yhi, by.d1, by.d2);
-	#endif
-	by.d1 = by.d1 + tmp;				/* middle part is here... */
-	by.d2 = by.d2 + (by.d1<tmp);	/* had a carry into high part. */
-
-/*	Unsigned compare of products. */
-	return (int)CMPULT192(ax,by);
-}
-
-/*
-!***************
-
-	subroutine mv_dwtvarbase_to_int64(x,p,m,u,ndim)
-!...Move an arbitrary-length integer, stored in balanced-digit, variable-base (DWT) form in an integer*4 array X(1:m),
-!   into row 2 of the integer*8 array U in base-2^64 positive-digit form. The BASE_INDEX array tells us whether a given
-!   digit of X is a DWT bigword or littleword during the conversion.
-!
-	implicit none
-	integer, intent(in) :: ndim,m,p
-	integer :: x(m)
-!...
-	integer :: bits0,i,j,k,slot,word,bw,sw,bjmodn
-	integer :: bit_end,bit_start,bit_sum,excess_bits
-	integer*8 :: u(ndim)
-	integer*8 :: cy1,t1
-
-	bw    = mod(p,m)	!* Number of bigwords in the Crandall/Fagin mixed-radix representation = (Mersenne exponent) mod (vector length).
-	sw    = m - bw		!* Number of smallwords.
-	bits0 = p/m		!* Number of bits in a smallword.
-
-!...make sure the V vector has positive leading digit...
-
-	do j = m,1,-1
-	  if(x(j) /= 0) then
-	    if(x(j) < 0) x = -x	!* flips the sign of the entire vector
-	    EXIT
-	  endif
-	enddo
-
-!...Now move the V vector into row 2 of the 64-bit integer U(:,:) array, converting to a fixed base along the way...
-
-	bit_start = 0
-	i = 1		!* put index is here
-	k = bits0 + 1	!* lowest-order digit is always a bigword.
-
-	do j = 1,m-1
-
-	  cy1 = x(j)			!* this converts to positive-digit form sans branches:
-	  t1 = ISHFT(cy1,-63)		!* If the sign bit = 1, t1 = 1...
-	  x(j+1) = x(j+1) - t1		!* subtract one from next-higher digit...
-	  cy1 = cy1 + ISHFT(t1,k)	!* and add the appropriate base to the current digit.
-
-	  bit_end = bit_start + k
-	  excess_bits = bit_end - 64
-	  if(excess_bits > 0)then
-	    mvbits64(cy1,0,k - excess_bits,u(i),bit_start)
-	    i = i + 1
-	    mvbits64(cy1,k - excess_bits,excess_bits,u(i),0)
-	    bit_start = excess_bits
-	  elseif(excess_bits == 0)then
-	    mvbits64(cy1,0,k,u(i),bit_start)
-	    i = i + 1
-	    bit_start = 0
-	  else
-	    mvbits64(cy1,0,k,u(i),bit_start)
-	    bit_start = bit_end
-	  endif
-
-	  bjmodn = iand(bjmodn + bw,m-1)	!* Can only use this speedy mod for n a power of 2...
-	  k = bits0 + ishft(sw - bjmodn, -31)
-
-	enddo
-
-!...Do the leading digit separately...
-
-	k = bits0		!* leading digit is always a smallword.
-
-	cy1 = x(m)		!* this converts to positive-digit form sans branches:
-	if(btest(cy1,63))then; print*,'FATAL: V-vector has negative leading digit.'; STOP; endif
-
-	bit_end = bit_start + k
-	excess_bits = bit_end - 64
-	if(excess_bits > 0)then
-	  mvbits64(cy1,0,k - excess_bits,u(i),bit_start)
-	  i = i + 1
-	  mvbits64(cy1,k - excess_bits,excess_bits,u(i),0)
-	elseif(excess_bits == 0)then
-	  mvbits64(cy1,0,k,u(i),bit_start)
-	else
-	  mvbits64(cy1,0,k,u(i),bit_start)
-	endif
-
-!...And zero any remaining slots.
-
-	u(i+1:ndim)=0
-
-	end subroutine mv_dwtvarbase_to_int64
-*/
-
-/*
-Various small GCD self-tests.
-*/
-  int test_gcd()
-  {
-	/* Adjust this value to reflect max. vector length desired for self-tests: */
-	#define MAX_ARRAY_DIM	344222	// Set >= 34420 to enable the 2-Mbit test, >= 344222 for the 22-Mbit test.
-	uint32 i,j,kk,cy,cy_fwd, p, lenX, lenY, lenU, lenV, lenZ, sec_len,vec_len, fft_len,pad_len;
-	int fail = 0;
-	uint64 q,qinv,tmp, mod, rem, tmp64 = 0;
-	uint64 *u, *v, *w, *x, *y, *z;
-	double *a, *b;
-	/* Extra stuff needed for eGCD: */
-	uint32 eGCD = FALSE;	// This can be toggled here for the entire self-test suite, or set/unset on a case-by-case basis.
-	uint32 HALF = FALSE;	// Ditto, though note that this one only has an effect if eGCD = TRUE
-	uint64 *Ap = 0x0, *Bp = 0x0, *Cp = 0x0, *Dp = 0x0;
-	uint32 len_AB,sign_AB, len_CD,sign_CD;
-
-	const uint32 num_mword_prime = 20;	// This value must match the # of const uint64 y**[] entries below
-	/* Define precomputed primes of lengths 1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100... words */
-	const uint32 len_mword_prime[] = {1,2,3,4,5,6,7,8,9,10,15,20,30,40,50,60,70,80,90,100,200,300,400,500,600,700,800,900,1000};
-
-	/* Multiword primes of various lengths, least-significant [0]-word leftmost: */
-	const uint64 y1 [ 1] = { 16357897499336320021ull };
-	const uint64 y2 [ 2] = { 6288613973463385207ull, 17450374062388873929ull };
-	const uint64 y3 [ 3] = { 10949365252588722703ull, 1187612066073480231ull, 13552797462131857749ull };
-	const uint64 y4 [ 4] = { 2290166132859908083ull, 15141729687261062388ull, 3353845001322014797ull, 10668737080789349223ull };
-	const uint64 y5 [ 5] = { 1909815749502324189ull, 3409223353484448796ull, 8041232679190714362ull, 18154516404802398221ull, 4919133996046537621ull };
-	const uint64 y6 [ 6] = { 13090340153895430355ull, 13316003867294904363ull, 4086877960619352751ull, 4291335014170841853ull, 11238552196362275798ull, 16072837637891425689ull };
-	const uint64 y7 [ 7] = { 6436712882813530525ull, 3090069294702847100ull, 14604617259536040931ull, 3881736123518103891ull, 12033780901417973631ull, 2121658959755889151ull, 1015222311804066800ull };
-	const uint64 y8 [ 8] = { 14218778164426756345ull, 3881548058709958902ull, 9435344667623458196ull, 15825266020521010528ull, 17646912765656454557ull, 9792466261391312854ull, 13836951808143056624ull, 16061163011110659553ull };
-	const uint64 y9 [ 9] = { 15531849983479353575ull, 13244756845085485741ull, 12235243054937127412ull, 4709695542336918044ull, 11468374613656802212ull, 10406739941509043845ull, 5882424034228837618ull, 1440633758287919928ull, 16966630538648771321ull };
-	const uint64 y10[10] = { 6154942771176964881ull, 14657802710986676243ull, 6367391185176237819ull, 7220279878409007123ull, 17437454528603741845ull, 11135248797733030345ull, 9466072315348543570ull, 16100604701688312144ull, 10143506867524673304ull, 174955034380965ull };
-	const uint64 y15[15] = { 8819938628747633077ull, 5942790028746468569ull, 4937492223042946033ull, 8730501809086437834ull, 1243723839784251790ull, 13716683457145268153ull, 488754166078206799ull, 1949794776773723448ull, 16629452879624053947ull, 1306630474066381588ull, 712163870246867324ull, 4434513443731937057ull, 9865801115224962539ull, 10472193284737100315ull, 17548140925265117600ull };
-	const uint64 y20[20] = { 13716683457145268063ull, 488754166078206799ull, 1949794776773723448ull, 16629452879624053947ull, 1306630474066381588ull, 712163870246867324ull, 4434513443731937057ull, 9865801115224962539ull, 10472193284737100315ull, 17548140925265117600ull, 13850887988965333521ull, 15740887654137791778ull, 12429824020706400576ull, 1975658775442752547ull, 13083483410316005473ull, 2987141948949602865ull, 18052154556276307484ull, 15115700697529442503ull, 7255095742800246455ull, 15953457815171762164ull };
-	const uint64 y30[30] = { 15546513674694818401ull, 6094687342574418678ull, 11123946222831718112ull, 12217883352413216347ull, 15550874203307310822ull, 7206187696399018449ull, 2089253583232126963ull, 11405684275551645807ull, 6595740366593535533ull, 513947309979730528ull, 623238905451609664ull, 14580584265174886892ull, 10164089984630032021ull, 6295726803342317635ull, 4169976721976571503ull, 11210444189919629661ull, 12253764729310025870ull, 15982375818961640458ull, 4835638556759726427ull, 4550010953562141766ull, 4708309259113265585ull, 7528936866787101258ull, 2993339143445663357ull, 4947092101451202277ull, 3470494295975615532ull, 7612382765688613360ull, 1218883132193815401ull, 14455287304819392609ull, 6565258255864629018ull, 446327481124760089ull };
-	const uint64 y40[40] = { 5095253233679814563ull, 9094873052810277402ull, 5502432594497619071ull, 10253180810779792312ull, 11476072751332469357ull, 15370988773499066191ull, 17581225978208792286ull, 526277277651859297ull, 1885992293663728627ull, 10153305797529801974ull, 5464673869670139137ull, 17401163700214643993ull, 17952651913326118589ull, 17742345220822852562ull, 4303343635453243709ull, 4013620749860433596ull, 6677371295215675137ull, 8193725131766686871ull, 7659816303732807063ull, 2282385124936021778ull, 9113451334539710638ull, 10881171535438876472ull, 5476412011083724153ull, 4864713286685264433ull, 5628110023653081947ull, 6926699408022157704ull, 6403858520342783331ull, 6696289090173746018ull, 13288386934025662659ull, 4362860849045370749ull, 16932024427685935439ull, 10744939144246263002ull, 11156351709119530729ull, 5538822441276190107ull, 593146585947091059ull, 16287977467190924709ull, 7931952495909425541ull, 1323052627185110710ull, 13855398606528214565ull, 4866516855423411741ull };
-	const uint64 y50[50] = { 803525447701230777ull, 11995973163883103671ull, 12513807631903294061ull, 13206931885979440508ull, 5781832881550132408ull, 2760760068053793998ull, 7279871441580359859ull, 8868806671540311732ull, 5211038110177683731ull, 12119673326921841832ull, 1820023893508557590ull, 15459246329388416701ull, 12576362623841191959ull, 5774269049365068898ull, 4014785386331757453ull, 16752022808542583240ull, 12646727565723050809ull, 17609658751476230416ull, 4567618588919495722ull, 18229471973955028755ull, 16365526703331694869ull, 6014402465390895934ull, 14379352264227400336ull, 7876171534707826122ull, 1228525137268097609ull, 4019073333129686499ull, 9768820187581486855ull, 15898165664400102772ull, 7343357198210053564ull, 17940883182259865333ull, 2218383524838541976ull, 839057351822990983ull, 15720245341684857544ull, 13837110097306272389ull, 18090746938060205953ull, 17626089812589788026ull, 17795072819199975687ull, 9122504416397444822ull, 7804210406658853893ull, 9085825988092390607ull, 38573679668204362ull, 6489947387271485874ull, 1580408331246418996ull, 843254109280709615ull, 9649575090905754564ull, 2194379072572275786ull, 1300697824291809687ull, 6758258572906841718ull, 6618325607590950226ull, 3084702059426541837ull };
-	const uint64 y60[60] = { 7402034356984044041ull, 18246800267094834509ull, 9355646276203941041ull, 17708280664550799098ull, 9884444230844509075ull, 2445586268038335580ull, 15849763400441455983ull, 16576388534672358617ull, 827691637990772710ull, 11785188128139116957ull, 9456967256883473605ull, 8353050390135495047ull, 9898448512339846253ull, 3671369225156337572ull, 14994373522692786547ull, 16859594160103298137ull, 14510522032204748689ull, 2512816302537297738ull, 15923263995753141351ull, 7428472312605469391ull, 14733812811272604664ull, 2503760744579306444ull, 1243518692364200645ull, 17335828848204797042ull, 15001078003915713534ull, 16234865648979114395ull, 35478654369178396ull, 16879682265742406327ull, 988822640532990652ull, 6441252208187343363ull, 7471324117207928020ull, 9321993331818520759ull, 13530855453747965965ull, 16878852095836032681ull, 999692530134946768ull, 12507060968559488909ull, 1646244627744266941ull, 2863247263353257887ull, 17953355115889683125ull, 10752587360488671818ull, 10532860777350021446ull, 2364427453494076967ull, 7455771926569400730ull, 5195467989032989018ull, 1834988649296150770ull, 12020496442723221343ull, 13846846029514221882ull, 14513446473104247648ull, 17178375772904719250ull, 7603705109499324354ull, 4478326676333763172ull, 11816598820375785035ull, 15738093394210874391ull, 6652442946475616050ull, 11983808028397646472ull, 2973536586709636463ull, 12267436926127621622ull, 12148982840339802509ull, 15447503755701382904ull, 9170693773425904909ull };
-	const uint64 y70[70] = { 13035174564231269173ull, 2031817238368888345ull, 14227779053044384239ull, 15697031109213648585ull, 16246797397621517264ull, 6332083408892383727ull, 9388765936565627601ull, 3055339951133711026ull, 17382999924880859970ull, 12771672880377584848ull, 8364171159194281015ull, 7076312770171612795ull, 2775074124666994714ull, 541377828095402822ull, 10583857457855614412ull, 2657209127254259452ull, 7971412721646259126ull, 15646619613237073924ull, 7193777446646360147ull, 13685152369053955928ull, 3288403155551002076ull, 16539390239681193976ull, 5892710676204249261ull, 7855521822881034680ull, 2976852341030657572ull, 7961678784864755097ull, 3979145662020335758ull, 16271488984939745189ull, 13690797836746321285ull, 11196731569817184326ull, 11885717758943982388ull, 17357160804673989517ull, 13804529479597294534ull, 16043721574971970400ull, 2096844946123222333ull, 9921975875987165637ull, 2768798678716382281ull, 12947331320923744074ull, 1084691708838543443ull, 10749401893225727149ull, 10768375991636447559ull, 3260458100291155455ull, 14413637195894780110ull, 15652838634222062970ull, 4048582062151630608ull, 8946406208257462796ull, 17853047967386028589ull, 11641662773640704651ull, 11303917129069076390ull, 17063452783458751347ull, 2049874958995099249ull, 15546160938745120266ull, 9538043553152286716ull, 18162221324682376697ull, 9628929869110154530ull, 6784665020163418060ull, 1378708342066296474ull, 11536753251347373535ull, 4534979510131407149ull, 17241580948451511362ull, 11374788804146286863ull, 6680542423372366105ull, 18262699497438852104ull, 2580645069957584047ull, 17399724939617397826ull, 13729671085246255165ull, 1262258216099317763ull, 8161435500821964714ull, 2568509089485667314ull, 13586984782852770896ull };
-	const uint64 y80[80] = { 3686879699877952657ull, 453741885935125956ull, 3938341138369497622ull, 16419599976023570750ull, 17910950143366487185ull, 5210269363513904009ull, 6365406617924173421ull, 1137288991988458049ull, 7514907328692273864ull, 6142628874908786840ull, 11723456648662884184ull, 15315897270966342086ull, 12657568862368998877ull, 12168618695858229831ull, 7894411684859967070ull, 15459088869505907538ull, 874797005966515536ull, 8552702139089316709ull, 14992398079764467869ull, 7490232213726620779ull, 2531746272349789760ull, 12917674233239253212ull, 15625330359100556049ull, 9093415458570795535ull, 13538593178636244204ull, 9351204785170562746ull, 14303148925206010133ull, 8876380558379849305ull, 1375113876046730803ull, 6246506863689184382ull, 10848669702049448500ull, 5254421183383954712ull, 12524232939757077924ull, 14519354321980521579ull, 13978127845095476853ull, 2819464761450493294ull, 5706007102882530005ull, 4551246688993488148ull, 11434788483145254823ull, 16173841354543856846ull, 12509315208860406023ull, 4172036203402343042ull, 11799184975285764057ull, 12925189668663022489ull, 5747199812525005113ull, 8764377299440814754ull, 1428887035158948118ull, 2233649227163155683ull, 9403449734926482041ull, 14757393839014274516ull, 8304251627243296163ull, 1937119483778953015ull, 3038821567260813359ull, 12717451046176433525ull, 8660458500420561522ull, 16144546352425647343ull, 14737742462608257290ull, 8697636606013577263ull, 17293841164144319976ull, 15928064021931218237ull, 12115048999295683122ull, 3255001946126728228ull, 8723994184679982995ull, 13439587285723996107ull, 7669185267979994838ull, 17379309529588673134ull, 6846954217803119602ull, 12356564666511368161ull, 9997388457301655286ull, 8583845996545228489ull, 8699953439928823890ull, 15611507079256356466ull, 9868697811108222063ull, 2375318481144124919ull, 12135791452161853891ull, 17277216076912414267ull, 6137863990657693900ull, 6415440756785175233ull, 10505426288612333674ull, 96627997299058663ull };
-	const uint64 y90[90] = { 14327012453873980177ull, 7896631668913294125ull, 10743171059613661959ull, 9254020797600318726ull, 7821195462223301752ull, 12067004296059438889ull, 11451145975088366539ull, 3356748286194447043ull, 5756955915065573524ull, 4967394651906704104ull, 16200443127959440649ull, 748986858458749867ull, 5935610987814150116ull, 7257751516081777223ull, 14632677804351388845ull, 10673271736460498614ull, 10598143576882860980ull, 5342508627593180883ull, 2289255404715875601ull, 18353036279070481307ull, 1462476853186759077ull, 3808265043807978242ull, 9872366095748063543ull, 10012349086639405479ull, 18005086452581661405ull, 16861687126321445049ull, 3294135959910501121ull, 5229794723077146894ull, 3794554444783317603ull, 12152833457765224981ull, 6372607249820535687ull, 3473297593761212178ull, 13771677630026388707ull, 345555404739096981ull, 12722074368935490792ull, 8379379876849799073ull, 10325518729628012158ull, 8698743719780584741ull, 1734303723533305594ull, 10717879912171351068ull, 18242600550810530527ull, 17323713309028296949ull, 552488179587813268ull, 3959798390820063107ull, 11914387527272528608ull, 8992644645129307207ull, 8962962353479785492ull, 3296326283812962026ull, 9706029827934915586ull, 6847717899506044476ull, 9393747518899414823ull, 15192326226324498214ull, 3713534964763546399ull, 9331349013517112684ull, 5279581406580265303ull, 15033055467318031785ull, 18014970481751534816ull, 16545256112998204669ull, 15574852613148839853ull, 3905640428107038313ull, 5804591361607298468ull, 14834959420280064335ull, 6522829411953094466ull, 9334674759583114244ull, 9293846761775796332ull, 16478175502621994545ull, 155420708107757789ull, 10526865276925211750ull, 77227172960405923ull, 15962009290802429017ull, 292405915149433336ull, 10802968607058396556ull, 17598447061487595913ull, 2597092985838943070ull, 5403151017845588797ull, 793021800535760345ull, 12220345118389884222ull, 11504865507696002441ull, 13842125103846679754ull, 14288554951294647296ull, 6646685620512546564ull, 11456722540097699043ull, 6371962759385722204ull, 18037500756150660222ull, 12372696248527741680ull, 17492587105238146773ull, 7029004413759659380ull, 9120104346702250711ull, 6438074115073266034ull, 8763185954741157488ull };
-	const uint64 y100[100] = { 8651761500958765785ull, 18144327741091934514ull, 12270157337756595666ull, 3156099161148169142ull, 4279338241002989180ull, 12083717968964082031ull, 9633317538807782904ull, 4870798020911124321ull, 2981175300705172332ull, 2151188483678590934ull, 10874467985769776816ull, 9382306411753430533ull, 1036028362070479645ull, 715225889416666486ull, 17804187131157907299ull, 13617865426029361250ull, 14751831722551499909ull, 13260903344971084065ull, 1385748392519462027ull, 15263475433688704658ull, 2053361431173867803ull, 8865637984170331070ull, 14288141023742298787ull, 11164273184837158983ull, 7985730982309512025ull, 5702456664721074475ull, 6166119136247968676ull, 11050566233103716601ull, 17496346217083451029ull, 18255904514403111601ull, 12216469643673765517ull, 3204481586097061852ull, 11547311222167575704ull, 2092570891781513960ull, 10130609998198227026ull, 1334745727499238271ull, 11318496886609830379ull, 3746857903549267585ull, 18313219339809391475ull, 17625930300511294323ull, 9367046833890820376ull, 15656591948583642216ull, 14940178300211116794ull, 2126476221790779880ull, 5974469236764481950ull, 6112732136507428712ull, 13721023765697723568ull, 2058945967400097341ull, 71366495660924301ull, 6152375738491761391ull, 10890210021618721414ull, 14177711553690602706ull, 10008455012174366044ull, 6960104434898147378ull, 7195027169060283539ull, 12345827306126428934ull, 10177731113695244691ull, 6733615417560036667ull, 13374622979515710970ull, 12005838724605074293ull, 6824930360888017736ull, 6164640720528174496ull, 1722571809865283122ull, 8542194294112176484ull, 14588769469551781698ull, 13470679260057915358ull, 3544840540517842490ull, 45773415225067640ull, 14389667535480509479ull, 5665297163344087138ull, 11200764405566174782ull, 10255336710292127490ull, 9646756178641762584ull, 1816340593859371465ull, 11953993155776741801ull, 1311299289600998907ull, 17847651011707485923ull, 4862792314258313882ull, 251379236753398197ull, 16605144173203011556ull, 10895953446093503786ull, 9391485119640543730ull, 6675880166957346028ull, 5920536850090467440ull, 8456458013384015881ull, 1313125224932561774ull, 9654843907277435837ull, 3005807009564180859ull, 2367918741490012605ull, 13282919683781319355ull, 3686131585151930643ull, 14798551216059837626ull, 3718487697088598262ull, 1213842626237841938ull, 354332956691164894ull, 2416810242288740439ull, 18361392855425522495ull, 17083443342269785852ull, 17392421406996216228ull, 13453818413721276128ull };
-
-	#define w431	4757395258967641292ull
-	const uint64 y431_test[431] = {
-		15441773537038068193ull, 3295852079737058441ull, 4277869928288117018ull, 7590955667915091630ull,w431,14096952260802720589ull,
-		17612861172509999137ull, 3613444211655844428ull,w431,14921004030550884877ull, 2503926509447010181ull,w431,16357897499336320021ull,
-		w431, 8589934593ull, 17179869187ull, 25769803781ull, 34359738375ull, 42949672969ull,w431,14757395255531667466ull,w431,w431,w431,
-		w431,w431,w431, 3435973836ull,w431,14757395255547920448ull, 59954448353250508ull,w431,14757395255543332928ull, 47569549377981644ull,
-		w431,14757395255535140928ull, 35261719295872204ull,w431,14757395255539597416ull, 38562521921932492ull,w431,14757395255540157512ull,
-		7094523705632279756ull,14757395256418681544ull, 3435973836ull,w431,w431, 1863156813004ull,w431,14757395255531669504ull,
-		243954142412ull,w431,14757395255531667470ull, 304083684556ull,w431,14757395255531667470ull, 46385646796ull,w431,14757395255531667514ull,
-		5347289241946570ull, 817021423881133759ull, 9222773902559490395ull,w431,w431, 7730941132ull, 0ull,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431, 7730941132ull,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,
-		w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431,w431
-	};
-
-	/* Note the revsre digit order here w.r.to the factor.c testFac* inits - that's because for the
-	latter we declare an explicit digit ordering, here we have to accept the default, which is x[0], x[1],...
-	*/
-	const uint128 testp128[] =
-	{
-		{ 1654746039858251761ull,   12240518780192025ull},
-		{  353773459776294223ull, 2923447923687422893ull},
-		{ 7733695761441692271ull,     128099917305337ull},
-		{ 5349936413307099433ull,   10811609908058563ull},
-		{  701890237964104231ull,        700245770430ull},
-		{ 7804835620876695225ull,  756146046438660814ull},
-		{17050154159176967743ull,  106450884062962221ull},
-		{15223106393317212577ull, 7448657723978021346ull},
-		{16055621295463638505ull,     644719741813452ull},
-		{ 4961431981940124743ull,      44696312570505ull},
-		{ 1738540175825943815ull,   99970972632587991ull},
-		{13887741953162944095ull,         67677680549ull},
-		{ 2894679571106043497ull,    5287390011750720ull},
-		{10375766809019373543ull,      11129117045170ull},
-		{ 8321741389535251703ull,       1551337752834ull},
-		{ 6586095673132787791ull,  133834206206032981ull},
-		{ 2460710484528304153ull,       5747037125100ull},
-		{ 7361144750966677159ull,      10824073357153ull},
-		{ 8212830436061989903ull,   32559650929209964ull},
-		{ 4235226679561594903ull,   94004235929829273ull},
-		{ 8676403300852410079ull,        103337218078ull},
-		{14211535226588354713ull,      62897895526806ull},
-		{14854696485656401105ull,          5492917609ull},
-		{10285356664749312993ull,          5799457823ull},
-		{16578386512849109713ull,          4303087381ull},
-		{18263664019678288919ull,          5202063708ull},
-		{ 7607475409566672241ull,          7785579841ull},
-		{ 5630449305759171207ull,          7593776864ull},
-		{ 9058738039473012457ull,         20308449831ull},
-		{ 5034609389988515233ull,         15531431134ull},
-		{ 8291172543411688687ull,         18216394609ull},
-		{15859685870849762975ull,         16259503442ull},
-		{11995354231649723881ull,         20551559047ull},
-		{15122069645900159367ull,         28364424832ull},
-		{ 9636073161914837921ull,         34441477586ull},
-		{ 1304857345634219175ull,         30977655046ull},
-		{ 4788416424359163737ull,         43144178324ull},
-		{ 2258783450670948535ull,         45963786472ull},
-		{15262466751214122975ull,         66325700032ull},
-		{ 3082735332820781609ull,         57210216387ull},
-		{15689518004012743009ull,         80355238912ull},
-		{10819675441336938065ull,        109346652057ull},
-		{10329047311584913071ull,      17534809723250ull},
-		{ 2595613477835803991ull,       1221873279710ull},
-		{ 8612165677489771129ull,      12549422209078ull},
-		{ 9015544550402598895ull,        112416184026ull},
-		{11385509628023387489ull,        142220976614ull},
-		{ 2773697912020767049ull,      14320762091913ull},
-		{ 7515490546312285159ull,       1996508583829ull},
-		{ 2388855518206098663ull,        365842230851ull},
-		{  465403687781705377ull,        261078947686ull},
-		{17079803649869853575ull,     199835753775288ull},
-		{15105677628487752455ull,        202355339943ull},
-		{16692905930976531153ull,      18738454648009ull},
-		{18170931828058363183ull,        412571049040ull},
-		{ 2216600112648316881ull,     534505286298455ull},
-		{0ull,0ull}
-	};
-
-	const uint192 testp192[] =
-	{
-		{15875370168207932041ull,11545660419510266595ull,                  133ull},
-		{  509892144742137431ull,15571349859840161706ull,                 1394ull},
-		{14226674137430228263ull, 4492854135134704005ull,               121320ull},
-		{10174116463236461383ull,14842833464112563611ull,           2649519282ull},
-		{ 7660429456444636239ull,17652352551621896287ull,            655903171ull},
-		{ 2219421057460140527ull,18314245293386716597ull,           1083827012ull},
-		{12343089078196252631ull,18225246095436784582ull,             13161208ull},
-		{ 8097149896429635207ull,14663183769241509326ull,                 4730ull},
-		{17184239148975426263ull,  881920578744577810ull,               215159ull},
-		{17733134473107607967ull, 9900144438119899815ull,            212724356ull},
-		{ 2803405107698253561ull, 5238930328752646394ull,                  261ull},
-		{16346425147370540471ull, 4415476118538293365ull,                    1ull},
-		{ 6167785434693019223ull,11905462972019801043ull,                70130ull},
-		{17951008765075981215ull,18429773635221665090ull,              5800574ull},
-		{15903397166638806257ull,14500669099417213747ull,             22381525ull},
-		{ 3893270457587058239ull, 3291757557782450881ull,                   14ull},
-		{14288981644299514807ull, 1390029428449091172ull,                 1552ull},
-		{ 5085420234315110585ull,14802171160149427175ull,                 2674ull},
-		{ 4949688733053552967ull,14291576310931480037ull,                  664ull},
-		{11405337619840706193ull, 6334326874596939334ull,               617742ull},
-		{  329809049266961143ull,10558642444782195772ull,      157590042578912ull},
-		{17814616685598394119ull, 1933308633079010416ull,        9118322195022ull},
-		{ 3547755741880899889ull,17012949627558354271ull,       70286054459973ull},
-		{16007877010440112335ull, 8040689323464953445ull,   492416983078691417ull},
-		{10050950882119470361ull, 9565712986615012496ull,          59364131986ull},
-		{0ull,0ull,0ull}
-	};
-
-	char char_buf[STR_MAX_LEN];
-	double dbl;
-	const uint64 *ptr_mword_prime[20], *curr_mword_prime;
-	uint64 *out_array = 0x0;
-uint64 k;
-uint192 x192;
-	/*...time-related stuff	*/
-	clock_t clock1, clock2;
-	double tdiff;
-
-	ptr_mword_prime[0] = y1;
-	ptr_mword_prime[1] = y2;
-	ptr_mword_prime[2] = y3;
-	ptr_mword_prime[3] = y4;
-	ptr_mword_prime[4] = y5;
-	ptr_mword_prime[5] = y6;
-	ptr_mword_prime[6] = y7;
-	ptr_mword_prime[7] = y8;
-	ptr_mword_prime[8] = y9;
-	ptr_mword_prime[9] = y10;
-	ptr_mword_prime[10] = y15;
-	ptr_mword_prime[11] = y20;
-	ptr_mword_prime[12] = y30;
-	ptr_mword_prime[13] = y40;
-	ptr_mword_prime[14] = y50;
-	ptr_mword_prime[15] = y60;
-	ptr_mword_prime[16] = y70;
-	ptr_mword_prime[17] = y80;
-	ptr_mword_prime[18] = y90;
-	ptr_mword_prime[19] = y100;
-
-	fprintf(stderr, "INFO: testing GCD routines...\n");
-
-	/* Init the RNG: */
-	rng_isaac_init(TRUE);
-
-	/* Allocate the main data arrays: */
-	sec_len = FFTMUL_THRESHOLD_BITS>>2;		// Input vecs to FFT-gcd must be suitably 0-padded at top:
-//	j = sec_len - (MAX_ARRAY_DIM & (sec_len-1));	// mod-via-AND here assumes sec_len a power of 2
-	j = sec_len;	// Use this simpler form to allow for pure-integer computation of a*u-b*v for check vs FFT result
-	u = (uint64 *)calloc(j+MAX_ARRAY_DIM, sizeof(uint64));
-	v = (uint64 *)calloc(j+MAX_ARRAY_DIM, sizeof(uint64));
-	w = (uint64 *)calloc(2*MAX_ARRAY_DIM, sizeof(uint64));
-	x = (uint64 *)calloc(j+MAX_ARRAY_DIM, sizeof(uint64));
-	y = (uint64 *)calloc(j+MAX_ARRAY_DIM, sizeof(uint64));
-	z = (uint64 *)calloc(2*MAX_ARRAY_DIM, sizeof(uint64));
-	a = (double *)calloc(16*MAX_ARRAY_DIM, sizeof(double));
-	b = (double *)calloc(16*MAX_ARRAY_DIM, sizeof(double));
-//=====================================
-#if 0
-#warning Enable me to resume FFT-GCD debug!
-	vec_len = 20000;	tmp64 = 4788416424359163823ull;// For len 10000 try j = 523 to expose cvt-cy bug
-	for(j = 0; j < 1000; j++) {
-		for(i = 0; i < vec_len-1; i++) {
-			u[i] = rng_isaac_rand();	v[i] = rng_isaac_rand();
-		}
-	if(j < 0) continue;		// ...then stick the offending j into the (j < **) to skip GCD until hit the case in question.
-	fft_gcd_debug = (j == -1);	// Set -1 to specific index you want to do detailed debug on
-	printf("****** Trial %u ******\n",j);	// Initial trial-loop to find smallest j for which test fails...
-	// Fix LSB = 1 to eliminate most common cause of the as-yet-unmultiplied-by-p vecs having GCD > 1, a common power of 2:
-	u[0] |= 1ull;	v[0] |= 1ull;
-	// Now mpy quasirandom 8999-word vecs by small prime GCD:
-	u[vec_len-1] = mi64_mul_scalar(u, tmp64, u, vec_len-1);
-	v[vec_len-1] = mi64_mul_scalar(v, tmp64, v, vec_len-1);
-	// Do the FFT-GCD ... result may be a multiple of the shared small prime, need to account for that:
-	lenX = fft_gcd(u,v, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0);
-	fail += !mi64_is_div_by_scalar64(u, tmp64, lenX);
-	fail += !mi64_is_div_by_scalar64(v, tmp64, lenX);
-	ASSERT(HERE, fail==0,"Failed testcase in FFT-GCD loop!");
-}
-exit(0);
-#endif
-//=====================================
-	// Basic tests of mi64_add/sub:
-	ASSERT(HERE, 1000 < MAX_ARRAY_DIM-2, "MAX_ARRAY_DIM too small!");
-	for(i = 0; i < 1000; ++i) {
-		for(j = 0; j <= i; ++j) {
-			x[j] = u[j] = rng_isaac_rand();
-			y[j] = v[j] = rng_isaac_rand();
-		}
-		z[j] = mi64_add_ref(x,y,z, i+1);
-		w[j] = mi64_add    (u,v,w, i+1);
-		if(!mi64_cmp_eq(w,z, i+2)) {
-			for(j = 0; j <= i+1; ++j) {
-				if(w[j] != z[j]) printf("w[%3d] = %20llu != z[%3d] = %20llu\n",j,w[j],j,z[j]);
-			}
-			ASSERT(HERE, 0, "mi64_add results differ!");
-		}
-	}
-
-	/*
-	Check the uint64[] <==> double[]  interconversion routines:
-	on arrays of increasing size, doubling the size until reach MAX_ARRAY_DIM:
-	*/
-	vec_len = 1;
-	while(vec_len <= MAX_ARRAY_DIM)
-	{
-		/* Init an array of (vec_len) uint64's using quasirandom selections from y4[]: */
-		for(i = 0; i < vec_len; i++) {
-			x[i] = rng_isaac_rand();
-			y[i] = rng_isaac_rand();
-			u[i] = x[i];
-			v[i] = y[i];
-		}
-		ASSERT(HERE, (cy = mi64_cvt_uint64_double(x,y, 0, vec_len, a)) <= 3, "GCD self-test CVT: unexpected carryout of mi64_cvt_uint64_double!");
-		// vec_len is for *each* 64-bit input vec, thus interleaved floating-double ovec has 8*vec_len terms.
-		// Any carryouts of the mi64_cvt_uint64_double() call must be folded back into high words of a[]
-		// prior to back-conversion:
-		if((cy & 1) == 1) {
-			i = 8*vec_len - 2;	i += ( (i >> DAT_BITS) << PAD_BITS );
-			ASSERT(HERE, a[i] < 0, "Unexpected sign of a[i]!");
-			a[i] += FFT_MUL_BASE;
-		}
-		if((cy >> 1) == 1) {
-			i = 8*vec_len - 2;	i += ( (i >> DAT_BITS) << PAD_BITS );
-			ASSERT(HERE, a[i+1] < 0, "Unexpected sign of a[i]!");
-			a[i+1] += FFT_MUL_BASE;
-		}
-		ASSERT(HERE, mi64_cvt_double_uint64(a, 4*vec_len, x,y) == 0, "GCD self-test CVT: unexpected carryout of mi64_cvt_double_uint64!");
-		for(i = 0; i < vec_len; i++) {
-			ASSERT(HERE, x[i] == u[i], "GCD self-test CVT: x[i] != u[i]");
-			ASSERT(HERE, y[i] == v[i], "GCD self-test CVT: y[i] != v[i]");
-		}
-		vec_len *= 2;
-		/* Want to do at least one test right at the max. length: */
-		if(vec_len > MAX_ARRAY_DIM) {
-			if(vec_len >= 2*MAX_ARRAY_DIM)
-				break;
-			else
-				vec_len = MAX_ARRAY_DIM;
-		}
-	}
-
-#if 0	// Set == 1 to enable mi64_add timing test
-	// mi64_add timing test
-	printf	("GCD: Starting mi64_add() timing test loop:\n");
-	vec_len = 1000;
-	for(j = 0; j <= i; ++j) {
-		x[j] = u[j] = rng_isaac_rand();
-		y[j] = v[j] = rng_isaac_rand();
-	}
-	clock1 = clock();
-	for(i = 0; i < 1000000; i++) {
-		ASSERT(HERE, mi64_add(x,y,z, vec_len) < 2, "GCD self-test mi64_add!");
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf	("GCD: mi64_add() self-test passed: %u*%u words processed, Time =%s\n",i,vec_len,get_time_str(tdiff));
-	exit(0);
-#endif
-
-// 8/08/2012: Track down mi64_div bug which leaves remainder with 1*divisor needing subtraction:
-	// x= 184027369222821018387581692039:
-	x[1] = 9976143675ull;	x[0] =  7539741246537263239ull;
-	y[1] =         10ull;	y[0] = 10733642766940458262ull;
-	mi64_div(x,y,2,2,u,v);	// quo in u[], rem in v[]
-	ASSERT(HERE, u[1]==0 && u[0]==942757929 && v[1]==0 && v[0]==1, "divisor check fails!");
-
-// 6/08/2012: Trial-factor of mm127 residue check for Paul L:
-#if 0
-	p = 127;	lenX = (p>>6);
-	memset(x,0xff,(lenX<<3));	x[lenX++] = (1ull << (p&63)) - 1;	// x has p
-	k = 20;	// Really testing k = 10 ... absorb the 2* part of q = 2.k.p+1 into this constant
-	y[lenX] = mi64_mul_scalar(x, k, y, lenX); ++y[0];	++lenX;		// y = q; Can add 1 sans carry check since 2.k.p even
-	x192 = twopmodq192(*(uint192*)x,*(uint192*)y);
-	printf("Res127 = %s\n", &string0[convert_mi64_base10_char(string0, (uint64*)&x192, lenX, 0)]);
-exit(0);
-#endif
-
-/**** 7/24/2012: Collect stats on the 64-bit mod-inverses: Results summarized in http://www.mersenneforum.org/showpost.php?p=305815&postcount=35 ****/
-#if 0
-	for(j = 0; (q = testp128[j].d0) != 0; ++j)
-	{
-		q = (q & 0xFFFFFFFFFFFFFFF0) + 0xf;	// Control q (mod 16) here; 1/3/5/7/9/b/d/f give >= 6/5/6/5/5/6/5/6 good bits in qinv_0, resp.
-		qinv = (q+q+q) ^ (uint64)2;
-		printf("q = %16llX, qinv_0 = %16llX\n",q,qinv);
-		for(i = 0; i < 4; i++)
-		{
-			tmp = q*qinv;
-			qinv = qinv*((uint64)2 - tmp);
-			printf("i = %1u, tmp = %16llX, qinv_%1u = %16llX\n",i+1,tmp,i+1,qinv);
-		}
-		printf("\n");
-	}
-	exit(0);
-	/*
-	Outputs:
-	q = 16F6D6C18B3C47F1, qinv_0 = 44E48444A1B4D7D1	i = 1, tmp = EE480F97731622C1, qinv_1 = 124D858CE6733911	i = 2, tmp = E4139C5D82487001, qinv_2 = BC53762ACEB3C911	i = 3, tmp = 2C90F9C0CF000001, qinv_3 = 3437C0D60FB3C911	i = 4, tmp = D89F000000000001, qinv_4 = FAA8C0D60FB3C911
-	q =  4E8DB1658ED4D4F, qinv_0 =  EBA91430AC7E7EF	i = 1, tmp =  D39144454B675C1, qinv_1 = 36C7715847EFB9AF	i = 2, tmp = 712332AE5CD6F001, qinv_2 = 69BD01D56D91A9AF	i = 3, tmp = 14668F09DF000001, qinv_3 = FFA966DEFC91A9AF	i = 4, tmp = 8FBF000000000001, qinv_4 = A51866DEFC91A9AF
-	q = 6B539B51F5AD1A6F, qinv_0 = 41FAD1F5E1074F4F	i = 1, tmp = A48B004D839C6941, qinv_1 = C414C264DE88148F	i = 2, tmp = 44BF3C380EBA7001, qinv_2 =  84C7DC293A3848F	i = 3, tmp =  91613F90F000001, qinv_3 = 79795CE732A3848F	i = 4, tmp = D11F000000000001, qinv_4 = AD285CE732A3848F
-	q = 4A3ECAD29E18D929, qinv_0 = DEBC6077DA4A8B79	i = 1, tmp =  269678EE781E761, qinv_1 = 6095BE0443830F19	i = 2, tmp = 85986005C0219C01, qinv_2 =  48B7A2BB416D319	i = 3, tmp = 797D799668F00001, qinv_3 = 870DDD5DA4A6D319	i = 4, tmp = FBBC1F0000000001, qinv_4 = CA21D65DA4A6D319
-	q =  9BD9D73E12A3227, qinv_0 = 1D38D85BA37E9677	i = 1, tmp = F0A5835CE8322A21, qinv_1 = C15E253B33BE4197	i = 2, tmp = A4FA02D184917C01, qinv_2 =  5D2554F09721D97	i = 3, tmp = CFDD2D724BF00001, qinv_3 = FB95536A0EE21D97	i = 4, tmp = E7B97F0000000001, qinv_4 = 2BC86A6A0EE21D97
-	q = 6C5058A538ABB6B9, qinv_0 = 44F109EFAA032429	i = 1, tmp = 80CC377F375D47A1, qinv_1 = E5F22CBE97202B89	i = 2, tmp = C9B2F87C21B5DC01, qinv_2 = 5F3E952780D96F89	i = 3, tmp =  9227C172AF00001, qinv_3 = 8850BBB376696F89	i = 4, tmp = C1AC5F0000000001, qinv_4 = 2EE0E4B376696F89
-	q = EC9E50DF4764663F, qinv_0 = C5DAF29DD62D32BF	i = 1, tmp = 90B7AF4982F39701, qinv_1 = 7F0D8F2603F189BF	i = 2, tmp = 23A594DBFCEF0001, qinv_2 = 864FDAA966A089BF	i = 3, tmp = ED2E98DF00000001, qinv_3 = 3B5C754866A089BF	i = 4, tmp =                1, qinv_4 = 3B5C754866A089BF
-	q = D343569BF7A681A1, qinv_0 = 79CA03D3E6F384E1	i = 1, tmp = 312EF0FD3601F281, qinv_1 = 69F342363EB36261	i = 2, tmp = 6811DCAE3549C001, qinv_2 = 24EE54BC9241A261	i = 3, tmp = 42A76140F0000001, qinv_3 = FD9A7341A241A261	i = 4, tmp = 1F00000000000001, qinv_4 = 3E9A7341A241A261
-	q = DED106807C4EA5E9, qinv_0 = 9C73138174EBF1B9	i = 1, tmp = FA0F4CA8BFE93E61, qinv_1 = 211267246F857E59	i = 2, tmp = 41B7426A260D5C01, qinv_2 = 5C2191ADD5988259	i = 3, tmp =  BD7D7FD86F00001, qinv_3 = 526AED840C288259	i = 4, tmp = E27FDF0000000001, qinv_4 = CCB866840C288259
-	q = 44DA8C19CCCDAC47, qinv_0 = CE8FA44D666904D7	i = 1, tmp = B5CE51392A8BCBA1, qinv_1 = F4BFA30837328177	i = 2, tmp = 9A51E798BE48DC01, qinv_2 = 6BE751A614783D77	i = 3, tmp = C70284B382F00001, qinv_3 = 3E1CBB8106E83D77	i = 4, tmp = 21675F0000000001, qinv_4 = FD6C928106E83D77
-	q = 1820891427D9BD07, qinv_0 = 48619B3C778D3717	i = 1, tmp = CB2259CEBA077CA1, qinv_1 = BCD4192420FAA4B7	i = 2, tmp =  59D4D2F72949C01, qinv_2 = D5251E31ACCF20B7	i = 3, tmp = FC4E06CB40F00001, qinv_3 = 1C9A18B8413F20B7	i = 4, tmp = B2E71F0000000001, qinv_4 = B482EFB8413F20B7
-	q = C0BB2BB9DA911E5F, qinv_0 = 4231832D8FB35B1F	i = 1, tmp = 3F7C33555ACB7281, qinv_1 = DDA732DAAFDCFD9F	i = 2, tmp = 991348CF35C9C001, qinv_2 =  E938E5EE4CEBD9F	i = 3, tmp = FC6E5780F0000001, qinv_3 = 6FEFE419D4CEBD9F	i = 4, tmp = 1F00000000000001, qinv_4 = 2EEFE419D4CEBD9F
-	q = 282BF7BBB65B1669, qinv_0 = 7883E73323114339	i = 1, tmp = C61F0F8D491E7861, qinv_1 = 3855D5FFC36755D9	i = 2, tmp = 2631C1F8D0E5DC01, qinv_2 = 5F8385AADA83D9D9	i = 3, tmp = CA776C1CAAF00001, qinv_3 = 9608BBA88513D9D9	i = 4, tmp = A95C5F0000000001, qinv_4 = AD3534A88513D9D9
-	q = 8FFE20F08BE357E7, qinv_0 = AFFA62D1A3AA07B7	i = 1, tmp = E735F74EAB512721, qinv_1 = 99D5D1F899552FD7	i = 2, tmp = 9B26E48177C53C01, qinv_2 = EF985EE6E0ABCBD7	i = 3, tmp = F050924299F00001, qinv_3 = D123E596481BCBD7	i = 4, tmp = 83AF3F0000000001, qinv_4 = 9D00FC96481BCBD7
-	q = 737CC3B40BEA0CF7, qinv_0 = 5A764B1C23BE26E7	i = 1, tmp = 4722F0BAE2705CE1, qinv_1 = 83CA2045945118C7	i = 2, tmp = 92CA75331A4E3C01, qinv_2 = 967581A3CFE074C7	i = 3, tmp = 4F481BE761F00001, qinv_3 = 7A4C3665EE5074C7	i = 4, tmp = 7B683F0000000001, qinv_4 = 9FB73D65EE5074C7
-	q = 5B66831EBDD1284F, qinv_0 = 1233895C397378EF	i = 1, tmp = DCB76CFFF3A6A9C1, qinv_1 =  1A57B87A008FEAF	i = 2, tmp = 625A00BCEA70F001, qinv_2 = CC17103FCEB4EEAF	i = 3, tmp = 136CD76D1F000001, qinv_3 = 6BBC8DD59DB4EEAF	i = 4, tmp = 963F000000000001, qinv_4 = 24AB8DD59DB4EEAF
-	q = 22263325F5639019, qinv_0 = 66729971E02AB049	i = 1, tmp = 8E339AFBF08F4721, qinv_1 = 425AD75A1B296829	i = 2, tmp = 2CAB11B3767D3C01, qinv_2 = EB1BD216A4BACC29	i = 3, tmp = 4C0E676C59F00001, qinv_3 = 196BD1B0FD4ACC29	i = 4, tmp = DDE73F0000000001, qinv_4 = 132EBAB0FD4ACC29
-	q = 66280A2665999EA7, qinv_0 = 32781E7330CCDBF7	i = 1, tmp = BBD83F252F04F021, qinv_1 = 2A005F04401DCD17	i = 2, tmp = FBC0067BDDC3FC01, qinv_2 = E1818B19D6B62917	i = 3, tmp = F5BFE0DE1FF00001, qinv_3 = 17336F6788262917	i = 4, tmp = 97C3FF0000000001, qinv_4 = 36C0866788262917
-	q = 71F9D5C4A5FDCC0F, qinv_0 = 55ED814DF1F9642F	i = 1, tmp = 46F5606A36E552C1, qinv_1 = 42CD64A80CCC32EF	i = 2, tmp = 1CC1ECA9D9C07001, qinv_2 = 39F03AF4EC43A2EF	i = 3, tmp = 18345177CF000001, qinv_3 = 745B2E1CAB43A2EF	i = 4, tmp = E69F000000000001, qinv_4 = 87EA2E1CAB43A2EF
-	q = 3AC68C39D286A817, qinv_0 = B053A4AD7793F847	i = 1, tmp = E31CB6FC0C63E661, qinv_1 = 9562FC71579213A7	i = 2, tmp = EA286E6B026F5C01, qinv_2 = 943E2BAE05190FA7	i = 3, tmp = 83D32A1F16F00001, qinv_3 = 9669F19DFE890FA7	i = 4, tmp = AFD1DF0000000001, qinv_4 = 4170789DFE890FA7
-	q = 7868C6D52355A6DF, qinv_0 = 693A547F6A00F49F	i = 1, tmp = CEFD0FC7EB3F3081, qinv_1 = F257C1871387D51F	i = 2, tmp = 26E6865C17CFC001, qinv_2 = 4FFFA2C2169F951F	i = 3, tmp = 637702E7F0000001, qinv_3 = 633739FC069F951F	i = 4, tmp = FF00000000000001, qinv_4 = 823739FC069F951F
-	q = C53983FE1DBFD899, qinv_0 = 4FAC8BFA593F89C9	i = 1, tmp = 6B6D3D5E8531F121, qinv_1 = 784CCA1D941F17A9	i = 2, tmp =  7DCBE368EA2BC01, qinv_2 = 7EA3AAE407CCFBA9	i = 3, tmp = D4CFBFFD85F00001, qinv_3 = B9D00EF44C5CFBA9	i = 4, tmp = DD8CBF0000000001, qinv_4 = D4A0F7F44C5CFBA9
-	q = CE267BC009CB14D1, qinv_0 = 6A7373401D613E71	i = 1, tmp = 86D6F0B211DFCE41, qinv_1 = 4F6EA4279F63B431	i = 2, tmp = ADCDB1F28654F001, qinv_2 = F707CA5F6061C431	i = 3, tmp = ED0D38919F000001, qinv_3 = 6C6339C3F161C431	i = 4, tmp = 7F3F000000000001, qinv_4 = D55439C3F161C431
-	q = 8EBCED6476944BE1, qinv_0 = AC36C82D63BCE3A1	i = 1, tmp = 5BC2EE20CEC83B81, qinv_1 = 38BA2EA1990CF821	i = 2, tmp = 2A4843A0FA2BC001, qinv_2 = B0D1E4B9F7693821	i = 3, tmp = 6E568585F0000001, qinv_3 = 1BF5F1F607693821	i = 4, tmp = BF00000000000001, qinv_4 = 7CF5F1F607693821
-	q = E61242B2877F82D1, qinv_0 = B236C817967E8871	i = 1, tmp = 657FB8F965A5C641, qinv_1 = E7FBAA8093800631	i = 2, tmp = 9AAC1DB857F8F001, qinv_2 = F1CFB9DFE73A1631	i = 3, tmp = 858BDACE1F000001, qinv_3 = CB911BC1F83A1631	i = 4, tmp = 183F000000000001, qinv_4 = BD821BC1F83A1631
-	q = FD7591A92E01DC17, qinv_0 = F860B4FB8A059447	i = 1, tmp = 8A1F11BF88345661, qinv_1 = 5F49FE6911B21FA7	i = 2, tmp = 1AB6B646CBDB5C01, qinv_2 = E30277E88D751BA7	i = 3, tmp = 2AE4102176F00001, qinv_3 = B33ADED8A6E51BA7	i = 4, tmp = 1EDDDF0000000001, qinv_4 = AEF965D8A6E51BA7
-	q = 69932E9430BDA571, qinv_0 = 3CB98BBC9238F051	i = 1, tmp = E6365667D9D348C1, qinv_1 = 112B05F5282AEB91	i = 2, tmp = 58C295C97ED37001, qinv_2 = CB0A2F37DA987B91	i = 3, tmp = B8C54F1E2F000001, qinv_3 = 8F52F68A3B987B91	i = 4, tmp = F35F000000000001, qinv_4 = 1183F68A3B987B91
-	q = 4E235FCFEB9EC287, qinv_0 = EA6A1F6FC2DC4797	i = 1, tmp = 22F7AB5F7C9C2EA1, qinv_1 = 77B185A0A04E6737	i = 2, tmp = 2863F8CC24821C01, qinv_2 = 663AFDEFE7166337	i = 3, tmp =  D2E85FF8CF00001, qinv_3 = B337B3E7CF866337	i = 4, tmp = 8C489F0000000001, qinv_4 = 40208AE7CF866337
-	q = 7DB71A406C1B7EE9, qinv_0 = 79254EC144527CB9	i = 1, tmp = 6975C718FBF99261, qinv_1 = 4408B873E7513559	i = 2, tmp = 3BDFF5A9AD8E5C01, qinv_2 = 15F17C78FBC73959	i = 3, tmp = A048D07DCEF00001, qinv_3 = 7A03D1189A573959	i = 4, tmp = 4258DF0000000001, qinv_4 = D6774A189A573959
-	q = 45DE8690D11C8DA1, qinv_0 = D19B93B27355A8E1	i = 1, tmp = 29917D8EE07F2281, qinv_1 = 72575215C0745661	i = 2, tmp = F165FE7CC059C001, qinv_2 =  F8CC387B7F29661	i = 3, tmp = 7F874088F0000001, qinv_3 =  87A3304C7F29661	i = 4, tmp = 1F00000000000001, qinv_4 = 497A3304C7F29661
-	q = 7310297FED8694EF, qinv_0 = 59307C7FC893BECF	i = 1, tmp = D89CAFEE2F98CF41, qinv_1 =  FC4EC2AE072AA0F	i = 2, tmp = F5ABADAFBC377001, qinv_2 = D014593228D31A0F	i = 3, tmp = 9473677EAF000001, qinv_3 = 1E0C2DFFE7D31A0F	i = 4, tmp = 445F000000000001, qinv_4 = 767B2DFFE7D31A0F
-	q = DC18EC457548269F, qinv_0 = 944AC4D05FD873DF	i = 1, tmp = AB20DC962A5B1181, qinv_1 = B4C368A54DA7B55F	i = 2, tmp = 70317BDC8DCDC001, qinv_2 = B6AF5D91F58D755F	i = 3, tmp = C151B522F0000001, qinv_3 = 7DA5FDEAE58D755F	i = 4, tmp = 5F00000000000001, qinv_4 = 3CA5FDEAE58D755F
-	q = A6780F1E9C75E5E9, qinv_0 = F3682D5BD561B1B9	i = 1, tmp = 6AF927168F723E61, qinv_1 = 42DAE090AC523E59	i = 2, tmp = 623B32D5234D5C01, qinv_2 = 65F58D5332254259	i = 3, tmp = 9C76A97786F00001, qinv_3 = A292714B68B54259	i = 4, tmp = 65BFDF0000000001, qinv_4 = 205FEA4B68B54259
-	q = D1DC623908989987, qinv_0 = 759526AB19C9CC97	i = 1, tmp = 8B2DA079665922A1, qinv_1 = 47ACD042E3FEE037	i = 2, tmp = BF4A1E1F68111C01, qinv_2 =  CCB37A6C7D1DC37	i = 3, tmp = 6EFD501B44F00001, qinv_3 = A70DBA9CB841DC37	i = 4, tmp = 62CF9F0000000001, qinv_4 = A0CE919CB841DC37
-	q = 85BA357C959BEBA1, qinv_0 = 912EA075C0D3C2E1	i = 1, tmp = 868AFCB09C4D1A81, qinv_1 = 5FF0EEDD1B7A7861	i = 2, tmp = 5861807B0C41C001, qinv_2 = 44940BE92490B861	i = 3, tmp = F83B451CF0000001, qinv_3 = 3A6B0F723490B861	i = 4, tmp = 9F00000000000001, qinv_4 = FB6B0F723490B861
-	q = 121BC8D7A91634A7, qinv_0 = 36535A86FB429DF7	i = 1, tmp = 59A4AD8A12C53821, qinv_1 = 80AB672C134DD717	i = 2, tmp =  B23E9AC1271FC01, qinv_2 = ED16933BE46C3317	i = 3, tmp = D0C925CF8FF00001, qinv_3 = 4C695AA925DC3317	i = 4, tmp = 88F1FF0000000001, qinv_4 = A4DE71A925DC3317
-	q = 4273DF5E2A92F759, qinv_0 = C75B9E1A7FB8E609	i = 1, tmp = D6C803F0785CA821, qinv_1 = 59B021424E6A3CE9	i = 2, tmp = BBBD04D8BA95FC01, qinv_2 = 72C5FF62FCD7E0E9	i = 3, tmp = D30875F0AFF00001, qinv_3 = 94AC0AD0DB67E0E9	i = 4, tmp = 4515FF0000000001, qinv_4 = DB86F3D0DB67E0E9
-	q = 1F58CF94B1C5E0B7, qinv_0 = 5E0A6EBE1551A227	i = 1, tmp = CB169D036A4009E1, qinv_1 = 25EAE6A9DF706107	i = 2, tmp = D7C9EC8D8F9E7C01, qinv_2 = 63B20C05A51EFD07	i = 3, tmp = 8E38F75AB3F00001, qinv_3 = FD63DE96898EFD07	i = 4, tmp = DCC67F0000000001, qinv_4 = 56736596898EFD07
-	q = D3CF2CA56E53F7DF, qinv_0 = 7B6D85F04AFBE79F	i = 1, tmp = D47FAF57C8762C81, qinv_1 = 377B7BCD85EEC41F	i = 2, tmp = 6ADDE4EAF243C001, qinv_2 = B38DB421D0BA841F	i = 3, tmp =  8E0D711F0000001, qinv_3 = EB660935C0BA841F	i = 4, tmp = 3F00000000000001, qinv_4 = 4A660935C0BA841F
-	q = 2AC81373C1540A29, qinv_0 = 80583A5B43FC1E79	i = 1, tmp = DDA8F653F5459B61, qinv_1 = 1CD5529F1B606E19	i = 2, tmp = 9542716D1FF29C01, qinv_2 = 97A9E5CE3DA73219	i = 3, tmp = 29B2804CB0F00001, qinv_3 = 9EDBC73216373219	i = 4, tmp = 77351F0000000001, qinv_4 = F19DC03216373219
-	q = D9BC5D7F7E467961, qinv_0 = 8D35187E7AD36C21	i = 1, tmp = AB225661473D9181, qinv_1 = FDC43FD06601AAA1	i = 2, tmp = EBC2877C564DC001, qinv_2 = 37292880BD9BEAA1	i = 3, tmp = 4715AB62F0000001, qinv_3 = E3C89FE7CD9BEAA1	i = 4, tmp = 5F00000000000001, qinv_4 = 24C89FE7CD9BEAA1
-	q = 9627357D21E7EE51, qinv_0 = C275A07765B7CAF1	i = 1, tmp = EAB93D063E4A4441, qinv_1 = 7974B2AC1E330AB1	i = 2, tmp = FC563E6578CDF001, qinv_2 = 35C724A4BC701AB1	i = 3, tmp = 743EE955BF000001, qinv_3 =  8CD8AF5AD701AB1	i = 4, tmp = 9B7F000000000001, qinv_4 = 9FFE8AF5AD701AB1
-	q = 8F5825CDE338E6AF, qinv_0 = AE087169A9AAB40F	i = 1, tmp = DE1A4BAE09BE9041, qinv_1 = E870F3570F93404F	i = 2, tmp = CB0F75010FB7F001, qinv_2 = 3191366669D0304F	i = 3, tmp = 9E390DB6FF000001, qinv_3 = DE1D7B1DB8D0304F	i = 4, tmp = 6DFF000000000001, qinv_4 = 1C6C7B1DB8D0304F
-	q = 240578B4B89E8D57, qinv_0 = 6C106A1E29DBA807	i = 1, tmp = 9A12D090F183F561, qinv_1 = 72F05839AD18F267	i = 2, tmp = F7763BFC348F1C01, qinv_2 = E902ECBA9F0CAE67	i = 3, tmp = B48E6C9FB4F00001, qinv_3 = 76A3203DB27CAE67	i = 4, tmp = 3DFD9F0000000001, qinv_4 = 1F86273DB27CAE67
-	q = 77848F0DF196D279, qinv_0 = 668DAD29D4C47769	i = 1, tmp = 76A4802B8B5692A1, qinv_1 = C0B2E233AD99F3C9	i = 2, tmp = F5739DBF68851C01, qinv_2 = DBEB573A8782F7C9	i = 3, tmp = 725BCC09E4F00001, qinv_3 = 3CD9C5B23712F7C9	i = 4, tmp = 9A639F0000000001, qinv_4 = B838EEB23712F7C9
-	q = 7D1DA6008F74A7EF, qinv_0 = 7758F201AE5DF7CF	i = 1, tmp = 1F35C252012E6341, qinv_1 = 4ECAD9FF69D8F70F	i = 2, tmp = 575A05D1AE857001, qinv_2 = 188CA7CCF0F7670F	i = 3, tmp = 9CC576326F000001, qinv_3 = AFB3572F6FF7670F	i = 4, tmp = 73DF000000000001, qinv_4 = 2CA2572F6FF7670F
-	q = 9E0174AF051FB561, qinv_0 = DA045E0D0F5F2021	i = 1, tmp = 71BF81859DC18181, qinv_1 = 44EF8335FABD6EA1	i = 2, tmp = 723585E47B7DC001, qinv_2 = 4D2D681D8827AEA1	i = 3, tmp = A2EFEBBAF0000001, qinv_3 = 5916886C9827AEA1	i = 4, tmp = 5F00000000000001, qinv_4 = 9A16886C9827AEA1
-	q = 267E278E283D9149, qinv_0 = 737A76AA78B8B3D9	i = 1, tmp =  945D677CD3E31E1, qinv_1 = 18F6DBC62920CCF9	i = 2, tmp = ED7E76C30DC87C01, qinv_2 = E9972C267E5030F9	i = 3, tmp = 1D851E6603F00001, qinv_3 = B1F81F2FA9E030F9	i = 4, tmp = 9CB07F0000000001, qinv_4 = 127C982FA9E030F9
-	q = 684C62D545C00BE7, qinv_0 = 38E5287FD14023B7	i = 1, tmp = 9D260851AEA91721, qinv_1 = 2A6EDC33F0373BD7	i = 2, tmp = 52E312A0F5A93C01, qinv_2 = 2944BACBBA41D7D7	i = 3, tmp = CC47A347B9F00001, qinv_3 =  B5AC67601B1D7D7	i = 4, tmp = B5D33F0000000001, qinv_4 = 5E07DD7601B1D7D7
-	q = 2126EB6FE667FCE7, qinv_0 = 6374C24FB337F6B7	i = 1, tmp = C19F23DB7BFCC321, qinv_1 = 490560A7E227BAD7	i = 2, tmp = E2628A2E84463C01, qinv_2 = A5B03BD09F9356D7	i = 3, tmp = 89B9EEDB21F00001, qinv_3 = 5F6277907F0356D7	i = 4, tmp = AAE03F0000000001, qinv_4 = CAE38E907F0356D7
-	q =  67572302F62A6A1, qinv_0 = 136056908E27F3E1	i = 1, tmp = 8314DA58FA664681, qinv_1 = 4F206FFDA7D87D61	i = 2, tmp = B0B19D71BE95C001, qinv_2 = 4E67FF3B525ABD61	i = 3, tmp =  469DF66F0000001, qinv_3 = A54AFA0A625ABD61	i = 4, tmp = DF00000000000001, qinv_4 = 264AFA0A625ABD61
-	q = ED07A6ED47E9B387, qinv_0 = C716F4C7D7BD1A97	i = 1, tmp = 1F6C268412BF9AA1, qinv_1 =  B105E47CAE4A637	i = 2, tmp = 902189D967DB1C01, qinv_2 =   D89FFD76A9A237	i = 3, tmp =  8707BEF14F00001, qinv_3 = 3C9541F01719A237	i = 4, tmp = 46299F0000000001, qinv_4 = 4C0618F01719A237
-	q = D1A225C51193DB07, qinv_0 = 74E6714F34BB9117	i = 1, tmp = F0149150BD74A4A1, qinv_1 = 54DF24BA5D2226B7	i = 2, tmp = 2F666BC625229C01, qinv_2 = 35C4D74DF63CA2B7	i = 3, tmp = 4AABAA3A30F00001, qinv_3 = 1A7B3C7D1AACA2B7	i = 4, tmp =  9E51F0000000001, qinv_4 = 3614137D1AACA2B7
-	q = E7A91D805CDE12D1, qinv_0 = B6FB5881169A3871	i = 1, tmp = 3297607FD9DE0641, qinv_1 = 7FA47E1017BB7631	i = 2, tmp = 381BA558A8D8F001, qinv_2 = 3185263C77958631	i = 3, tmp = 361D8D2A1F000001, qinv_3 = 5C0C09F288958631	i = 4, tmp = D03F000000000001, qinv_4 = 85FD09F288958631
-	q = FC2C1E395142152F, qinv_0 = F4845AABF3C63F8F	i = 1, tmp = BB5E3CA0A67A6641, qinv_1 = C2F9357D00BD61CF	i = 2, tmp = 83CF38DF6228F001, qinv_2 = D544EC71EEB351CF	i = 3, tmp = CE0A81B41F000001, qinv_3 = 12F860FDDDB351CF	i = 4, tmp = 643F000000000001, qinv_4 = 150760FDDDB351CF
-	q = 1EC2F20EF37E67D1, qinv_0 = 5C48D62CDA7B3771	i = 1, tmp = 51E13EF97E84BA41, qinv_1 =  7361C7D81214131	i = 2, tmp = 748FC358667EF001, qinv_2 = EED1E93EB7E55131	i = 3, tmp = DE1199CEDF000001, qinv_3 = 6252891708E55131	i = 4, tmp = 59BF000000000001, qinv_4 = C5C3891708E55131
-	
-	Now focus just on the final (4th) iteration, sort by #good bits in tmp:
-	q = 66280A2665999EA7	i = 4, tmp = 97C3FF0000000001, qinv_4 = 36C0866788262917
-	q = 4273DF5E2A92F759	i = 4, tmp = 4515FF0000000001, qinv_4 = DB86F3D0DB67E0E9
-	q = 121BC8D7A91634A7	i = 4, tmp = 88F1FF0000000001, qinv_4 = A4DE71A925DC3317
-	q = 3AC68C39D286A817	i = 4, tmp = AFD1DF0000000001, qinv_4 = 4170789DFE890FA7
-	q = 7DB71A406C1B7EE9	i = 4, tmp = 4258DF0000000001, qinv_4 = D6774A189A573959
-	q = A6780F1E9C75E5E9	i = 4, tmp = 65BFDF0000000001, qinv_4 = 205FEA4B68B54259
-	q = DED106807C4EA5E9	i = 4, tmp = E27FDF0000000001, qinv_4 = CCB866840C288259
-	q = FD7591A92E01DC17	i = 4, tmp = 1EDDDF0000000001, qinv_4 = AEF965D8A6E51BA7
-	q = C53983FE1DBFD899	i = 4, tmp = DD8CBF0000000001, qinv_4 = D4A0F7F44C5CFBA9
-	q = 4E235FCFEB9EC287	i = 4, tmp = 8C489F0000000001, qinv_4 = 40208AE7CF866337
-	q = D1DC623908989987	i = 4, tmp = 62CF9F0000000001, qinv_4 = A0CE919CB841DC37
-	q = ED07A6ED47E9B387	i = 4, tmp = 46299F0000000001, qinv_4 = 4C0618F01719A237
-	q = 77848F0DF196D279	i = 4, tmp = 9A639F0000000001, qinv_4 = B838EEB23712F7C9
-	q = 240578B4B89E8D57	i = 4, tmp = 3DFD9F0000000001, qinv_4 = 1F86273DB27CAE67
-	q = 1F58CF94B1C5E0B7	i = 4, tmp = DCC67F0000000001, qinv_4 = 56736596898EFD07
-	q =  9BD9D73E12A3227	i = 4, tmp = E7B97F0000000001, qinv_4 = 2BC86A6A0EE21D97
-	q = 267E278E283D9149	i = 4, tmp = 9CB07F0000000001, qinv_4 = 127C982FA9E030F9
-	q = 6C5058A538ABB6B9	i = 4, tmp = C1AC5F0000000001, qinv_4 = 2EE0E4B376696F89
-	q = 44DA8C19CCCDAC47	i = 4, tmp = 21675F0000000001, qinv_4 = FD6C928106E83D77
-	q = 282BF7BBB65B1669	i = 4, tmp = A95C5F0000000001, qinv_4 = AD3534A88513D9D9
-	q = 8FFE20F08BE357E7	i = 4, tmp = 83AF3F0000000001, qinv_4 = 9D00FC96481BCBD7
-	q = 22263325F5639019	i = 4, tmp = DDE73F0000000001, qinv_4 = 132EBAB0FD4ACC29
-	q = 737CC3B40BEA0CF7	i = 4, tmp = 7B683F0000000001, qinv_4 = 9FB73D65EE5074C7
-	q = 684C62D545C00BE7	i = 4, tmp = B5D33F0000000001, qinv_4 = 5E07DD7601B1D7D7
-	q = 2126EB6FE667FCE7	i = 4, tmp = AAE03F0000000001, qinv_4 = CAE38E907F0356D7
-	q = 1820891427D9BD07	i = 4, tmp = B2E71F0000000001, qinv_4 = B482EFB8413F20B7
-	q = D1A225C51193DB07	i = 4, tmp = 09E51F0000000001, qinv_4 = 3614137D1AACA2B7
-	q = 4A3ECAD29E18D929	i = 4, tmp = FBBC1F0000000001, qinv_4 = CA21D65DA4A6D319
-	q = 2AC81373C1540A29	i = 4, tmp = 77351F0000000001, qinv_4 = F19DC03216373219
-	q = 8F5825CDE338E6AF	i = 4, tmp = 6DFF000000000001, qinv_4 = 1C6C7B1DB8D0304F
-	q = 7D1DA6008F74A7EF	i = 4, tmp = 73DF000000000001, qinv_4 = 2CA2572F6FF7670F
-	q =  4E8DB1658ED4D4F	i = 4, tmp = 8FBF000000000001, qinv_4 = A51866DEFC91A9AF
-	q = 1EC2F20EF37E67D1	i = 4, tmp = 59BF000000000001, qinv_4 = C5C3891708E55131
-	q = 71F9D5C4A5FDCC0F	i = 4, tmp = E69F000000000001, qinv_4 = 87EA2E1CAB43A2EF
-	q = 16F6D6C18B3C47F1	i = 4, tmp = D89F000000000001, qinv_4 = FAA8C0D60FB3C911
-	q = 9627357D21E7EE51	i = 4, tmp = 9B7F000000000001, qinv_4 = 9FFE8AF5AD701AB1
-	q = 7310297FED8694EF	i = 4, tmp = 445F000000000001, qinv_4 = 767B2DFFE7D31A0F
-	q = 69932E9430BDA571	i = 4, tmp = F35F000000000001, qinv_4 = 1183F68A3B987B91
-	q = FC2C1E395142152F	i = 4, tmp = 643F000000000001, qinv_4 = 150760FDDDB351CF
-	q = E7A91D805CDE12D1	i = 4, tmp = D03F000000000001, qinv_4 = 85FD09F288958631
-	q = 5B66831EBDD1284F	i = 4, tmp = 963F000000000001, qinv_4 = 24AB8DD59DB4EEAF
-	q = CE267BC009CB14D1	i = 4, tmp = 7F3F000000000001, qinv_4 = D55439C3F161C431
-	q = E61242B2877F82D1	i = 4, tmp = 183F000000000001, qinv_4 = BD821BC1F83A1631
-	q = 6B539B51F5AD1A6F	i = 4, tmp = D11F000000000001, qinv_4 = AD285CE732A3848F
-	q = 7868C6D52355A6DF	i = 4, tmp = FF00000000000001, qinv_4 = 823739FC069F951F
-	q =  67572302F62A6A1	i = 4, tmp = DF00000000000001, qinv_4 = 264AFA0A625ABD61
-	q = 8EBCED6476944BE1	i = 4, tmp = BF00000000000001, qinv_4 = 7CF5F1F607693821
-	q = 85BA357C959BEBA1	i = 4, tmp = 9F00000000000001, qinv_4 = FB6B0F723490B861
-	q = DC18EC457548269F	i = 4, tmp = 5F00000000000001, qinv_4 = 3CA5FDEAE58D755F
-	q = 9E0174AF051FB561	i = 4, tmp = 5F00000000000001, qinv_4 = 9A16886C9827AEA1
-	q = D9BC5D7F7E467961	i = 4, tmp = 5F00000000000001, qinv_4 = 24C89FE7CD9BEAA1
-	q = D3CF2CA56E53F7DF	i = 4, tmp = 3F00000000000001, qinv_4 = 4A660935C0BA841F
-	q = D343569BF7A681A1	i = 4, tmp = 1F00000000000001, qinv_4 = 3E9A7341A241A261
-	q = C0BB2BB9DA911E5F	i = 4, tmp = 1F00000000000001, qinv_4 = 2EEFE419D4CEBD9F
-	q = 45DE8690D11C8DA1	i = 4, tmp = 1F00000000000001, qinv_4 = 497A3304C7F29661
-	q = EC9E50DF4764663F	i = 4, tmp = 0000000000000001, qinv_4 = 3B5C754866A089BF
-	
-	So q == 7 or 9 (mod 16) gives 5 good bits in qinv_0 ==> 40 good bits in q*qinv_3 product computed on final iteration.
-	   q == 1 of F (mod 16) gives >= 6 bits in qinv_0, with odds of x good bits roughly halving with each increment in x.
-	
-	But what about the "missing moduli", i.e. q == 3,5,B,D (mod 16)?
-	
-	Tried those by twiddling the bottom 4 bits of the same dataset, all give >= 5 bits.
-	But interestingly, the mod-16 values which give "at least 5" (3,7,9,13) *never* seem to give more than 5,
-	whereas the ones which give "at least 6" (1,5,11,15) all give 6,7,8,... good bits with probability
-	halving with each additional "lucky" bit of precision.
-	*/
-#endif
-
-	// 07/31/2012: Test code using random inputs to test why "missing borrows" do not hose
-	// the loop-folded quotient algorithm (M(p) bad in this context since all the digits = B-1):
-	vec_len = 20;
-	for(i = 0; i < vec_len; i++)
-	{
-		x[i]  = rng_isaac_rand();
-	}
-	mod = 16357897499336320049ull;
-	rem = mi64_div_by_scalar64(x, mod, vec_len, u);
-	ASSERT(HERE, rem == mi64_div_by_scalar64_u2(x, mod, vec_len, v) && mi64_cmp_eq(u,v,vec_len), "mi64_div_by_scalar64 Test #0.a!");
-	ASSERT(HERE, rem == mi64_div_by_scalar64_u4(x, mod, vec_len, v) && mi64_cmp_eq(u,v,vec_len), "mi64_div_by_scalar64 Test #0.b!");
-	
-	/**** 5/16/2012: Tmp-code to test Montgomery-mul remainder for M977, see if can efficiently convert
-	carryout of my RL divisibility algo to true remainder:
-	*/
-	p = 977;	lenX = (p>>6);
-	memset(x,0xff,(lenX<<3));	x[lenX++] = (1ull << (p&63)) - 1;
-	// Try an odd modulus close to 2^64:
-	mod = 16357897499336320049ull;
-	rem =  8623243291871090711ull;
-	ASSERT(HERE, !mi64_is_div_by_scalar64   (x, mod, lenX), "mi64_is_div_by_scalar64 Test #0.1!");
-	ASSERT(HERE, !mi64_is_div_by_scalar64_u2(x, mod, lenX), "mi64_is_div_by_scalar64 Test #0.2!");
-	ASSERT(HERE, !mi64_is_div_by_scalar64_u4(x, mod, lenX), "mi64_is_div_by_scalar64 Test #0.4!");
-	ASSERT(HERE, rem == mi64_div_by_scalar64   (x, mod, lenX, y), "mi64_div_by_scalar64 Test #0.1!");
-	ASSERT(HERE, rem == mi64_div_by_scalar64_u2(x, mod, lenX, y), "mi64_div_by_scalar64 Test #0.2!");
-	ASSERT(HERE, rem == mi64_div_by_scalar64_u4(x, mod, lenX, y), "mi64_div_by_scalar64 Test #0.4!");
-	
-	// A second odd modulus: Use x/2 for modding here to help debug the even-modulus code tested by the next case:
-	mod = 8040689323464953445ull;
-	rem =    2985496175289855ull;
-	mi64_shrl_short(x, y, 1, lenX);	// y = x>>1
-	ASSERT(HERE, mi64_div_by_scalar64(y, mod, lenX, y) == rem, "mi64_div_by_scalar64 Test #1!");
-	// Use that to create an even modulus
-	mod=2*8040689323464953445ull;
-	rem =    5970992350579711ull;
-	ASSERT(HERE, mi64_div_by_scalar64(x, mod, lenX, y) == rem
-				&& !mi64_mul_scalar(y, mod, y, lenX)
-				&& (x[0] == y[0] + rem) && mi64_cmp_eq(x+1,y+1,lenX-1), "mi64_div_by_scalar64 Test #2!");
-	// And a modulus divisible by 4:
-	mod = 1635789749933632004ull;
-	rem =  816586808693101463ull;
-	ASSERT(HERE, mi64_div_by_scalar64(x, mod, lenX, y) == rem
-				&& !mi64_mul_scalar(y, mod, y, lenX)
-				&& (x[0] == y[0] + rem) && mi64_cmp_eq(x+1,y+1,lenX-1), "mi64_div_by_scalar64 Test #3!");
-	// And a modulus divisible by 8:
-	mod=2*1635789749933632004ull;
-	rem =  816586808693101463ull;
-	ASSERT(HERE, mi64_div_by_scalar64(x, mod, lenX, y) == rem
-				&& !mi64_mul_scalar(y, mod, y, lenX)
-				&& (x[0] == y[0] + rem) && mi64_cmp_eq(x+1,y+1,lenX-1), "mi64_div_by_scalar64 Test #4!");
-	// And a modulus divisible by 16:
-	mod=4*1635789749933632004ull;
-	rem =  4088166308560365471ull;
-	ASSERT(HERE, mi64_div_by_scalar64(x, mod, lenX, y) == rem
-				&& !mi64_mul_scalar(y, mod, y, lenX)
-				&& (x[0] == y[0] + rem) && mi64_cmp_eq(x+1,y+1,lenX-1), "mi64_div_by_scalar64 Test #5!");
-	// And a modulus divisible by 32:
-	mod=8*1635789749933632004ull;
-	rem =  4088166308560365471ull;
-	ASSERT(HERE, mi64_div_by_scalar64(x, mod, lenX, y) == rem
-			&& !mi64_mul_scalar(y, mod, y, lenX)
-			&& (x[0] == y[0] + rem) && mi64_cmp_eq(x+1,y+1,lenX-1), "mi64_div_by_scalar64 Test #6!");
-
-	/* Test shift routines: */
-	vec_len = MAX_ARRAY_DIM;
-	for(i = 0; i < vec_len; i++)
-	{
-		x[i]  = rng_isaac_rand();
-	}
-	for(i = 0; i < 100; ++i)
-	{
-		j = (vec_len * 65)*rng_isaac_rand_double_norm_pos();
-		if(j > (vec_len << 6))
-		{
-			mi64_shl (x, u, j, vec_len);	ASSERT(HERE, mi64_iszero(u, vec_len), "u != 0");
-			mi64_shrl(x, v, j, vec_len);	ASSERT(HERE, mi64_iszero(v, vec_len), "v != 0");
-		}
-		else
-		{
-			mi64_shrl(x, v, j, vec_len);
-			mi64_shl (v, v, j, vec_len);
-											j = (vec_len << 6) - j;
-			mi64_shl (x, u, j, vec_len);
-			mi64_shrl(u, u, j, vec_len);
-			mi64_add (u, v, v, vec_len);
-			ASSERT(HERE, mi64_cmp_eq(x, v, vec_len), "x != v");
-		}
-	}
-
-	/*
-	Test #0: check the uint64[] <==> scalar double interconversion routine:
-	*/
-	vec_len = 1;
-	x[0] = 0ull;
-	dbl = mi64_cvt_double(x, vec_len);
-	ASSERT(HERE, fabs(dbl - 0.0) < 1e-10, "GCD self-test 0a");
-
-	x[0] = 1ull;
-	dbl = mi64_cvt_double(x, vec_len);
-	ASSERT(HERE, fabs(dbl - 1.0) < 1e-10, "GCD self-test 0b");
-
-	x[0] = 2ull;
-	dbl = mi64_cvt_double(x, vec_len);
-	ASSERT(HERE, fabs(dbl - 2.0) < 1e-10, "GCD self-test 0c");
-
-	x[0] = 16357897499336320021ull;
-	dbl = mi64_cvt_double(x, vec_len);
-	ASSERT(HERE, fabs(dbl - 16357897499336320021.0)/16357897499336320021.0 < 1e-10, "GCD self-test 0d");
-
-	/* x = 46189291499145878679976776583847887373 */
-	vec_len = 2;
-	x[0] = 14921004030550884877ull;
-	x[1] =  2503926509447010181ull;
-	dbl = mi64_cvt_double(x, vec_len);
-	ASSERT(HERE, fabs(dbl - 46189291499145878679976776583847887373.0)/46189291499145878679976776583847887373.0 < 1e-10, "GCD self-test 0e");
-/*
-b=2^64;t=1;x=2421895350930534175*t;t*=b;x+=2416981188099281967*t;t*=b;x+=17119718852038027230*t;t*=b;x+=792133985803089581*t;t*=b;x+=2967675029189239758*t;t*=b;x+=3104517425838997392*t;t*=b;x+=16792572262743320*t;t*=b;x+=1917901379003983519*t;t*=b;x+=15810729283012914840*t;t*=b;x+=10466991815671365502
-*/
-	mi64_set_eq(y, y10, 10);
-	mi64_add_scalar(y, 1ull, y, 10);	/* p+1 */
-	ASSERT(HERE, mi64_is_div_by_scalar64(y,     30ull, 10) == TRUE, "GCD self-test #0g");
-	ASSERT(HERE, mi64_is_div_by_scalar64(y,    997ull, 10) == TRUE, "GCD self-test #0h");
-	ASSERT(HERE, mi64_is_div_by_scalar64(y,   2113ull, 10) == TRUE, "GCD self-test #0i");
-	ASSERT(HERE, mi64_is_div_by_scalar64(y,  87643ull, 10) == TRUE, "GCD self-test #0j");
-	ASSERT(HERE, mi64_is_div_by_scalar64(y, 219607ull, 10) == TRUE, "GCD self-test #0k");	i = (uint32)6; i *= 2113; i *= 219607;	/* Product of selected small factors slightly below 2^64 */
-	ASSERT(HERE, mi64_is_div_by_scalar64(y, (uint64)i, 10) == TRUE, "GCD self-test #0l");	i = (uint32)87643;
-	ASSERT(HERE, mi64_is_div_by_scalar64(y,5ull*997*i, 10) == TRUE, "GCD self-test #0m");
-
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y,     30, 10) == TRUE, "GCD self-test #0n");
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y,    997, 10) == TRUE, "GCD self-test #0o");
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y,   2113, 10) == TRUE, "GCD self-test #0p");
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y,  87643, 10) == TRUE, "GCD self-test #0q");
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y, 219607, 10) == TRUE, "GCD self-test #0r");	i = (uint32)6; i *= 2113; i *= 219607;	/* Product of selected small factors slightly below 2^32 */
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y,      i, 10) == TRUE, "GCD self-test #0s");	i = (uint32)87643;
-	ASSERT(HERE, mi64_is_div_by_scalar32((uint32 *)y,5*997*i, 10) == TRUE, "GCD self-test #0t");
-
-	/************* Test DIV1: shakedown tests of non-folded DIV algo: ****************/
-	printf("Performing GCD self-test DIV1...\n");
-	vec_len = 1;
-	while(vec_len <= 1024)
-	{
-		for(j=0;j<100;++j)
-		{
-			/* Init an array of (vec_len) quasirandom 64-bit words: */
-			for(i = 0; i < vec_len; i++)
-			{
-				x[i] = y[i] = rng_isaac_rand();
-				tmp64 = rng_isaac_rand();
-			}
-			x[vec_len] = mi64_mul_scalar(x, tmp64, x, vec_len);
-			ASSERT(HERE, mi64_is_div_by_scalar64(x, tmp64, vec_len+1) == TRUE, "GCD self-test #DIV1a!");
-			rem = mi64_div_by_scalar64(x, tmp64, vec_len+1, x);
-			ASSERT(HERE, rem == 0ull && mi64_cmp_eq(x,y,vec_len) == TRUE, "GCD self-test #DIV1b!");
-		}
-		vec_len *= 2;
-		/* Want to do at least one test right at the max. length: */
-		if(vec_len > MAX_ARRAY_DIM)
-		{
-			if(vec_len >= 2*MAX_ARRAY_DIM)
-				break;
-			else
-				vec_len = MAX_ARRAY_DIM;
-		}
-	}
-
-	/************* Test DIV2: shakedown tests of loop-folded DIV algo: ****************/
-	printf("Performing GCD self-test DIV2...\n");	fflush(stdout);
-	vec_len = MAX_ARRAY_DIM;
-	for(i = 0; i < vec_len; i++) {
-		x[i] = rng_isaac_rand();
-	}
-
-	// Correctness tests of short (< 1-word) shifts:
-	// rshift:
-	for(vec_len = 1; vec_len < 1000; vec_len++) 	{
-		tmp64 = x[vec_len];
-		j =  tmp64     & 0x3f;	// 63 bits = max shift count for short-shift routines
-		kk= (tmp64>>8) & 0x3f;	// Also randomly move start-word around between x[0-63] to test alignment-based code
-		// We do in-place shifts below, so copy x-vectar data into working arrays y and z:
-		memcpy(y+kk,x,vec_len<<3);	memcpy(z+kk,x,vec_len<<3);	
-		tmp64 = mi64_shrl_short_ref(y+kk,y+kk,j,vec_len);
-		tmp   = mi64_shrl_short    (z+kk,z+kk,j,vec_len);
-		ASSERT(HERE, tmp == tmp64 && mi64_cmp_eq(z+kk,y+kk,vec_len), "short-rshift test fails!");
-	}
-	// lshift:
-	for(vec_len = 1; vec_len < 1000; vec_len++) 	{
-		tmp64 = x[vec_len];
-		j =  tmp64     & 0x3f;	// 63 bits = max shift count for short-shift routines
-		kk= (tmp64>>8) & 0x3f;	// Also randomly move start-word around between x[0-63] to test alignment-based code
-		// We do in-place shifts below, so copy x-vectar data into working arrays y and z:
-		memcpy(y+kk,x,vec_len<<3);	memcpy(z+kk,x,vec_len<<3);	
-		tmp64 = mi64_shl_short_ref(y+kk,y+kk,j,vec_len);
-		tmp   = mi64_shl_short    (z+kk,z+kk,j,vec_len);
-		ASSERT(HERE, tmp == tmp64 && mi64_cmp_eq(z+kk,y+kk,vec_len), "short-lshift test fails!");
-	}
-
-	// Performance tests of short (< 1-word) shifts:
-	vec_len = 1000;
-	memcpy(y,x,vec_len<<3);	ASSERT(HERE, y[0] != 0 && y[0] == x[0], "init error!");
-	j = 1000000;
-	// I/0 vectors have same alignment:
-	clock1 = clock();
-	tmp = 0ull;
-	for(i = 0; i < j; i++) 	{
-		tmp64 = mi64_shrl_short(y,z,63,vec_len);	// 63 bits = max shift count for short-shift routines
-		tmp  += mi64_shl_short (z,y,63,vec_len);	y[0] += (tmp64 >> 1);	// Add rshift off-shift bits back into low word, cy = 0 assured
-	}
-	ASSERT(HERE, mi64_cmp_eq(x,y,vec_len), "short-shift test fails!");
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("64-bit short-shift, #words = %llu, Time =%s, (sum = %llu)\n",(uint64)j*vec_len*2,get_time_str(tdiff),tmp);
-
-	// I/0 vectors have 8-byte-offset alignment:
-	clock1 = clock();
-	tmp = 0ull;
-	for(i = 0; i < j; i++) 	{
-		tmp64 = mi64_shrl_short(y,z+1,63,vec_len);	// 63 bits = max shift count for short-shift routines
-		tmp  += mi64_shl_short (z+1,y,63,vec_len);	y[0] += (tmp64 >> 1);	// Add rshift off-shift bits back into low word, cy = 0 assured
-	}
-	ASSERT(HERE, mi64_cmp_eq(x,y,vec_len), "short-shift test fails!");
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("64-bit short-shift, 8-byte-offset, Time =%s, (sum = %llu)\n",get_time_str(tdiff),tmp);
-
-	// 64-bit Montgomery inverse, several versions:
-	q = x[0];	q |= 1;
-	clock1 = clock();
-	tmp = 0ull;
-	j = 100000000;
-	while(j--) {
-		qinv = (q+q+q) ^ (uint64)2;
-		for(i = 0; i < 4; i++) {
-			qinv = qinv*((uint64)2 - q*qinv);
-		}
-		q += 2;	tmp += qinv;
-	}	// Core2: ~19.8 cycles; no faster unrolling the inner 4-pass loop
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("64-bit Mont-inverse v1, #calls = %u, Time =%s, (sum = %llu)\n",j,get_time_str(tdiff),tmp);
-
-	q = x[0];	q |= 1;
-	clock1 = clock();
-	tmp = 0ull;
-	j = 100000000;
-	while(j--) {
-		qinv = minv8[(q&0xff)>>1];
-		for(i = 0; i < 3; i++) {
-			qinv = qinv*((uint64)2 - q*qinv);
-		}
-		q += 2;	tmp += qinv;
-	}	// Core2: ~14.3 cycles, 5.5 cycles faster than above v1
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("64-bit Mont-inverse v2, #calls = %u, Time =%s, (sum = %llu)\n",j,get_time_str(tdiff),tmp);
-
-	uint32 q32,qi32;
-	q = x[0];	q |= 1;
-	clock1 = clock();
-	tmp = 0ull;
-	j = 100000000;
-	while(j--) {
-		q32  = q; qi32 = minv8[(q&0xff)>>1];
-		qi32 = qi32*((uint32)2 - q32*qi32);
-		qi32 = qi32*((uint32)2 - q32*qi32);	qinv = qi32;
-		qinv = qinv*((uint64)2 - q*qinv);
-		q += 2;	tmp += qinv;
-	}	// Core2: ~12 cycles, ~8 cycles faster than v1
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("64-bit Mont-inverse v3, #calls = %u, Time =%s, (sum = %llu)\n",j,get_time_str(tdiff),tmp);
-
-	// Correctness (w.r.to the internal fault handlers) tests of the radix_power64() routine,
-	// using random 32/48/64-bit moduli (by which we mean < 2^32/48/64, respectively):
-	kk = 10000000;
-	// q < 2^32:
-	clock1 = clock();
-	tmp = 0ull;
-	j = kk;
-	while(j--) {
-		q = rng_isaac_rand() >> 32; q |= 1;	// Force odd
-		qinv = (q+q+q) ^ (uint64)2;
-		for(i = 0; i < 4; i++) {
-			qinv = qinv*((uint64)2 - q*qinv);
-		}
-		tmp += radix_power64(q,qinv,2);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("Shakedown-test of RADIX_POWER64, q < 2^32, #calls = %u (sum = %llu), Time =%s\n",kk,tmp,get_time_str(tdiff));
-
-	// q < 2^48:
-	clock1 = clock();
-	tmp = 0ull;
-	j = kk;
-	while(j--) {
-		q = rng_isaac_rand() >> 16; q |= 1;	// Force odd
-		qinv = (q+q+q) ^ (uint64)2;
-		for(i = 0; i < 4; i++) {
-			qinv = qinv*((uint64)2 - q*qinv);
-		}
-		tmp += radix_power64(q,qinv,2);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("Shakedown-test of RADIX_POWER64, q < 2^48, #calls = %u (sum = %llu), Time =%s\n",kk,tmp,get_time_str(tdiff));
-
-	// q < 2^64:
-	clock1 = clock();
-	tmp = 0ull;
-	j = kk;
-	while(j--) {
-		q = rng_isaac_rand(); q |= 1;	// Force odd
-		qinv = (q+q+q) ^ (uint64)2;
-		for(i = 0; i < 4; i++) {
-			qinv = qinv*((uint64)2 - q*qinv);
-		}
-		tmp += radix_power64(q,qinv,2);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("Shakedown-test of RADIX_POWER64, q < 2^64, #calls = %u (sum = %llu), Time =%s\n",kk,tmp,get_time_str(tdiff));
-
-	// Timing tests of key parts of the radix_power64() routine:
-	q = x[0];	q |= 1;
-	qinv = (q+q+q) ^ (uint64)2;
-	for(i = 0; i < 4; i++) {
-		qinv = qinv*((uint64)2 - q*qinv);
-	}
-
-	// 64-bit stdlib mod:
-	j = 100000000;
-	clock1 = clock();
-	tmp = 0ull;
-	for(i = 0; i < j; i++) 	{
-		tmp += x[i&0xffff] % q;
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("64-bit stdlib mod, #calls = %u, Time =%s, (sum = %llu)\n",j,get_time_str(tdiff),tmp);
-
-	// MONT_SQR64() macro:
-	clock1 = clock();
-	tmp = 0ull;	// Accumulate sum of outputs in loop to prevent compiler from optimizing away the macro as a no-op:
-	for(i = 0; i < j; i++) 	{
-		MONT_SQR64(x[1],q,qinv,rem);	tmp += rem;
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("MONT_SQR64, #calls = %u, Time =%s, (sum = %llu)\n",j,get_time_str(tdiff),tmp);
-
-	// 64-bit odd modulus q, B^2 mod q:
-	clock1 = clock();
-	for(i = 0; i < j; i++) 	{
-		tmp = radix_power64(q,qinv,2);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("RADIX_POWER64, B^2 mod q, 64-bit modulus, #calls = %u, Time =%s\n",j,get_time_str(tdiff));
-
-	// 64-bit odd modulus q, B^1000 mod q:
-	j = 10000000;	// ~10x slower than B^2, so cut loop count by 10
-	clock1 = clock();
-	for(i = 0; i < j; i++) 	{
-		tmp = radix_power64(q,qinv,1000);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("RADIX_POWER64, B^1000 mod q, 64-bit modulus, #calls = %u, Time =%s\n",j,get_time_str(tdiff));
-
-	// 64-bit odd modulus q, B^1000000 mod q:
-	j = 5000000;	// ~2x slower than B^1000, so cut loop count by 2
-	clock1 = clock();
-	for(i = 0; i < j; i++) 	{
-		tmp = radix_power64(q,qinv,1000000);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("RADIX_POWER64, B^1000000 mod q, 64-bit modulus, #calls = %u, Time =%s\n",j,get_time_str(tdiff));
-
-	tmp = 0ull;	// array-length accumulator
-	// Use a smaller max. length for this time-consuming series of correctness tests:
-	for(i = 1; i < 1000; i++)
-	{
-		tmp += i;
-		mod = rng_isaac_rand();
-	//	printf("Len = %u, q = %16llu...\n",i,mod);
-		rem   = mi64_div_by_scalar64   (x, mod, i, y);	mi64_mul_scalar(y, mod, z, i);	mi64_add_scalar(z, rem, z, i);	ASSERT(HERE, mi64_cmp_eq(x, z, i), "GCD self-test #DIV2_u1");
-		tmp64 = mi64_div_by_scalar64_u2(x, mod, i, z);	ASSERT(HERE, rem == tmp64 && mi64_cmp_eq(y,z,i), "GCD self-test #DIV2_u2");
-		tmp64 = mi64_div_by_scalar64_u4(x, mod, i, z);	ASSERT(HERE, rem == tmp64 && mi64_cmp_eq(y,z,i), "GCD self-test #DIV2_u4");
-	}
-	printf("DIVREM u1|u2|u4 correctness self-test passed. Total vector Len = %llu\n",tmp);
-
-	// Remainder-only, odd moduli:
-	j = 300000;	//*** Note that the following 4 tests have work increasing with the *square* of the loop bound
-	clock1 = clock();
-	tmp = 0ull;	// array-length accumulator
-	for(i = 4; i < j; i += 40 /*(((i>>2)+3) & 0xfffffffc)*/)	// Increment must keep i a multiple of 4
-	{
-		tmp += i;
-		mod = rng_isaac_rand();
-		mod |= 1;	// Force odd  modulus
-	//	printf("Len = %u, q = %16llu...\n",i,mod);
-		mi64_div_by_scalar64_u4(x, mod, i, 0x0);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("\tREM 4-fold performance test, odd -moduli: Total vector Len = %llu, Time =%s\n",tmp,get_time_str(tdiff));
-
-	// Remainder-only, even moduli:
-	clock1 = clock();
-	tmp = 0ull;	// array-length accumulator
-	for(i = 4; i < j; i += 40 /*(((i>>2)+3) & 0xfffffffc)*/)	// Increment must keep i a multiple of 4
-	{
-		tmp += i;
-		mod = rng_isaac_rand();
-		mod &= 0xFFFFFFFFFFFFFFFE;	// Force even modulus
-	//	printf("Len = %u, q = %16llu...\n",i,mod);
-		mi64_div_by_scalar64_u4(x, mod, i, 0x0);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("\tREM 4-fold performance test, even-moduli: Total vector Len = %llu, Time =%s\n",tmp,get_time_str(tdiff));
-
-	// Remainder and quotient, odd moduli:
-	clock1 = clock();
-	tmp = 0ull;	// array-length accumulator
-	for(i = 4; i < j; i += 40 /*(((i>>2)+3) & 0xfffffffc)*/)	// Increment must keep i a multiple of 4
-	{
-		tmp += i;
-		mod = rng_isaac_rand();
-		mod |= 1;	// Force odd  modulus
-	//	printf("Len = %u, q = %16llu...\n",i,mod);
-		mi64_div_by_scalar64_u4(x, mod, i, z);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("DIV+REM 4-fold performance test, odd -moduli: Total vector Len = %llu, Time =%s\n",tmp,get_time_str(tdiff));
-
-	// Remainder and quotient, even moduli:
-	clock1 = clock();
-	tmp = 0ull;	// array-length accumulator
-	for(i = 4; i < j; i += 40 /*(((i>>2)+3) & 0xfffffffc)*/)	// Increment must keep i a multiple of 4
-	{
-		tmp += i;
-		mod = rng_isaac_rand();
-		mod &= 0xFFFFFFFFFFFFFFFE;	// Force even modulus
-	//	printf("Len = %u, q = %16llu...\n",i,mod);
-		mi64_div_by_scalar64_u4(x, mod, i, z);
-	}
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf("DIV+REM 4-fold performance test, even-moduli: Total vector Len = %llu, Time =%s\n",tmp,get_time_str(tdiff));
-	/********************************************************************************/
-exit(0);
-	/*
-	Test #1: check the uint64[] pure-integer multiply routines - these use grammar-school algorithm:
-	*/
-	vec_len = 1;
-	while(vec_len <= 1024)
-	{
-		/* Init an array of (vec_len) quasirandom 64-bit words: */
-		for(i = 0; i < vec_len; i++)
-		{
-			x[i] = rng_isaac_rand();
-			y[i] = rng_isaac_rand();
-		}
-
-		mi64_mul_vector(x, vec_len, y, vec_len, z, &lenZ);
-		mi64_mul_vector_lo_half(x, y, u, vec_len);
-		ASSERT(HERE, mi64_cmp_eq(u, z, vec_len), "GCD self-test 1: mi64_mul_vector_lo_half");
-		mi64_mul_vector_hi_half(x, y, v, vec_len);
-		ASSERT(HERE, mi64_cmp_eq(v, &z[vec_len], vec_len), "GCD self-test 1: mi64_mul_vector_hi_half");
-
-		/******************************************************************/
-		/* ****TODO**** Compare pure-integer x*y with FFT-based multiply: */
-		/******************************************************************/
-
-		vec_len *= 2;
-		/* Want to do at least one test right at the max. length: */
-		if(vec_len > MAX_ARRAY_DIM)
-		{
-			if(vec_len >= 2*MAX_ARRAY_DIM)
-				break;
-			else
-				vec_len = MAX_ARRAY_DIM;
-		}
-	}
-
-	/*
-	Test #2: let p = 16357897499336320021 (64-bit test prime), check that mi64_pprimeF
-	to various small-prime bases returns TRUE:
-	*/
-	printf("\nPerforming GCD self-test #2...\n");
-	vec_len = 1;
-	x[0] = 43112609ull;
-	ASSERT(HERE, pprimeF   ((uint32)x[0], 2ull), "GCD self-test #2,02");
-	ASSERT(HERE, mi64_pprimeF(x, 2ull, vec_len), "GCD self-test #2, 2");
-	y[0] = 16357897499336320021ull;
-	ASSERT(HERE, mi64_div_y32(y, (uint32)x[0], u, 1) == 7915399 && u[0] == 379422583758ull, "GCD self-test #3f");
-	ASSERT(HERE, mi64_pprimeF(y, 2ull, vec_len), "GCD self-test #2, 2");
-	ASSERT(HERE, mi64_pprimeF(y, 3ull, vec_len), "GCD self-test #2, 3");
-	ASSERT(HERE, mi64_pprimeF(y, 5ull, vec_len), "GCD self-test #2, 5");
-	ASSERT(HERE, mi64_pprimeF(y, 7ull, vec_len), "GCD self-test #2, 7");
-	ASSERT(HERE, mi64_pprimeF(y,11ull, vec_len), "GCD self-test #2,11");
-	ASSERT(HERE, mi64_pprimeF(y,13ull, vec_len), "GCD self-test #2,13");
-	ASSERT(HERE, mi64_pprimeF(y,17ull, vec_len), "GCD self-test #2,17");
-	ASSERT(HERE, mi64_pprimeF(y,19ull, vec_len), "GCD self-test #2,19");
-	ASSERT(HERE, mi64_pprimeF(y,23ull, vec_len), "GCD self-test #2,23");
-	ASSERT(HERE, mi64_pprimeF(y,29ull, vec_len), "GCD self-test #2,29");
-
-	/*
-	Test #2a: 128-bit test primes:
-	*/
-	vec_len = 2;
-	for(j = 0; testp128[j].d0 != 0; ++j)
-	{
-		x[0] = testp128[j].d0;	x[1] =  testp128[j].d1;
-		/*
-		fprintf(stderr, "mi64_pprimeF: testing p = %s\n", &char_buf[convert_uint128_base10_char(char_buf, testp128[j], 0)]);
-		*/
-		ASSERT(HERE, mi64_pprimeF(x, 3ull, vec_len) == 1, "GCD self-test #2a");
-	}
-
-	/*
-	Test #2b: 192-bit test primes:
-	*/
-	vec_len = 3;
-	for(j = 0; testp192[j].d0 != 0; ++j)
-	{
-		x[0] = testp192[j].d0;	x[1] =  testp192[j].d1;	x[2] =  testp192[j].d2;
-		/*
-		fprintf(stderr, "mi64_pprimeF: testing p = %s\n", &char_buf[convert_uint192_base10_char(char_buf, testp192[j], 0)]);
-		*/
-		ASSERT(HERE, mi64_pprimeF(x, 3ull, vec_len) == 1, "GCD self-test #2b");
-	}
-
-	ASSERT(HERE, mi64_pprimeF(y1 , 3ull, 1 ) == 1, "GCD self-test #2.1 ");
-	ASSERT(HERE, mi64_pprimeF(y2 , 3ull, 2 ) == 1, "GCD self-test #2.2 ");
-	ASSERT(HERE, mi64_pprimeF(y3 , 3ull, 3 ) == 1, "GCD self-test #2.3 ");
-	ASSERT(HERE, mi64_pprimeF(y4 , 3ull, 4 ) == 1, "GCD self-test #2.4 ");
-	ASSERT(HERE, mi64_pprimeF(y5 , 3ull, 5 ) == 1, "GCD self-test #2.5 ");
-	ASSERT(HERE, mi64_pprimeF(y6 , 3ull, 6 ) == 1, "GCD self-test #2.6 ");
-	ASSERT(HERE, mi64_pprimeF(y7 , 3ull, 7 ) == 1, "GCD self-test #2.7 ");
-	ASSERT(HERE, mi64_pprimeF(y8 , 3ull, 8 ) == 1, "GCD self-test #2.8 ");
-	ASSERT(HERE, mi64_pprimeF(y9 , 3ull, 9 ) == 1, "GCD self-test #2.9 ");
-	ASSERT(HERE, mi64_pprimeF(y10, 3ull, 10) == 1, "GCD self-test #2.10");
-
-	/*
-	Test #3: Test mi64 scalar-mul routines:
-	let q = 16357897499336320021;
-	x = q * {some quasi-random multiword int D}
-	u,v,x,y available for storage.
-	*/
-	printf("\nPerforming GCD self-test #3...\n");
-	vec_len = 1;
-	for(j = 0; j < 10; ++j)
-	{
-		vec_len = 100*j + 1;
-		for(i = 0; i < vec_len; i++)
-		{
-			x[i] = rng_isaac_rand();
-		}
-		mi64_set_eq(y, x, vec_len);									/* y = D */
-		x[vec_len] = mi64_mul_scalar(x, 16357897499336320021ull, x, vec_len);	/* x = q*D */
-		++vec_len;	/* Changed x[vec_len++] = ... to a separate increment step to avoid order-of-eval ambiguity. */
-		ASSERT(HERE, mi64_getlen(x, vec_len) == vec_len , "GCD self-test #3: expected nonzero carryout of mi64_mul_scalar!");
-		mi64_set_eq_scalar(u, 16357897499336320021ull, vec_len);	/* u = q */
-		mi64_div(x, u, vec_len, vec_len, v, x);	/* In-place x/q: dividend put into v, remainder back into x */
-		ASSERT(HERE, mi64_cmp_eq       (y,  v, vec_len-1), "GCD self-test #3: (x*q)/q != x");
-		ASSERT(HERE, mi64_cmp_eq_scalar(x, 0ull, vec_len), "GCD self-test #3: (x*q)%q != 0");
-	}
-#if 0
-	/*
-	Test #4: Test mi64 scalar-div and remainder routines:
-	x = p * {some quasi-random (N-1)-word int}
-	y =     {some quasi-random  N   -word prime}, N of various lengths for which we have prestored multiword primes.
-	*/
-	printf("\nPerforming GCD self-test #4...\n");
-	mi64_set_eq(y, y10, 10);
-	mi64_add_scalar(y, 1ull, y, 10);	/* p+1 */
-	ASSERT(HERE, mi64_div_y32(y,    30, 0x0, 10) == 0, "GCD self-test #4a");
-	ASSERT(HERE, mi64_div_y32(y,   997, 0x0, 10) == 0, "GCD self-test #4b");
-	ASSERT(HERE, mi64_div_y32(y,  2113, 0x0, 10) == 0, "GCD self-test #4c");
-	ASSERT(HERE, mi64_div_y32(y, 87643, 0x0, 10) == 0, "GCD self-test #4d");
-	ASSERT(HERE, mi64_div_y32(y,219607, 0x0, 10) == 0, "GCD self-test #4e");
-	i = (uint32)6; i *= 2113; i *= 219607;	/* Product of selected small factors slightly below 2^32 */
-	ASSERT(HERE, mi64_div_y32(y,     i,  u , 10) == 0, "GCD self-test #4f");
-	i = (uint32)87643;
-	ASSERT(HERE, mi64_div_y32(u,5*997*i, u , 10) == 0, "GCD self-test #4f");
-
-	mi64_sub_scalar(y, 1ull, y, 10);	/* p */
-	ASSERT(HERE, mi64_div_y32(y, (uint32)3578974993u, u, 10) == (uint32)2756613022u, "GCD self-test #4g");
-	ASSERT(HERE, mi64_mul_scalar(u, (uint32)3578974993u, u, 10) == 0, "GCD self-test #4h");
-	ASSERT(HERE, mi64_add_scalar(u, (uint32)2756613022u, u, 10) == 0, "GCD self-test #4i");
-	ASSERT(HERE, mi64_cmp_eq(u, y10,10), "GCD self-test #4j");
-#endif
-	/*
-	Test #5: let p = 16357897499336320049 [next-higher 64-bit prime above our 1-word reference prime]
-	x = p * {some quasi-random (N-1)-word int}
-	y =     {some quasi-random  N   -word prime}, N of various lengths for which we have prestored multiword primes.
-	*/
-	printf("\nPerforming GCD self-test #5...\n");
-	for(j = 1; j < num_mword_prime; j++)	/* Start multipliers with 2-word primes and go up from there */
-	{
-	#if GCD_DEBUG >= 1
-		if(gcd_debug) printf("\n********************************\n      Length-%u inputs:\n********************************\n",j+1);
-	#endif
-		vec_len = len_mword_prime[j]-1;
-		curr_mword_prime = ptr_mword_prime[j];
-		for(i = 0; i < vec_len; i++) {
-			x[i] = rng_isaac_rand();
-		}
-		x[vec_len] = mi64_mul_scalar(x, 16357897499336320049ull, x, vec_len);
-		ASSERT(HERE, x[vec_len] != 0, "GCD self-test #5: expected nonzero carryout of mi64_mul_scalar!");
-		vec_len += 1;
-		// Toggle this #if value from 1 to 0 to enable code to auto-generate more fixed-length base-2 PRPs:
-	#if 1
-		for(i = 0; i < vec_len; i++) {
-			y[i] = curr_mword_prime[i];
-		}
-	#else
-		// Start with a random n-word int, fiddle LSB to make odd, then sub-2 until find a 2-PRP:
-		for(i = 0; i < vec_len; i++) {
-			y[i] = rng_isaac_rand();
-		}	y[0] |= 1ull;
-		uint32 max_try = 10000, curr_p, small_div, ntry = 0;
-		while(ntry < max_try) {
-			mi64_sub_scalar(y, 1ull, z, vec_len);	// z = y-1
-			small_div = 0;	// First trial-divide by the small primes (2-PRPs, actually):
-			for(curr_p = 3; curr_p < 10000*vec_len; curr_p += 2) {
-				if(!pprimeF(curr_p,2)) continue;
-				if(mi64_is_div_by_scalar64(y, curr_p, vec_len)) {
-					small_div = 1;	//	printf("Found small divisor %u\n",curr_p);
-					break;
-				}
-			}
-			if(!small_div && ntry++ && mi64_twopmodq(z, vec_len, 0, y, vec_len, 0x0) == 1) {
-				printf("[%u]th try: Found %u-word 2-PRP = { ",ntry,vec_len);	for(i = 0; i < vec_len-1; i++) { printf("%lluull, ",y[i]); }	printf("%lluull };\n",y[i]);
-				break;
-				// This requires y to have no more digits than STR_MAX_LEN = 1024, i.e. ~50 words:
-			//	printf("[%u]th try: Found %u-word 2-PRP = %s\n",i,vec_len,&string0[convert_mi64_base10_char(string0, y,vec_len, 0)]);
-			}
-			mi64_sub_scalar(y, 2ull, y, vec_len);	// y -= 2
-		}
-		ASSERT(HERE, i < max_try, "No 2-PRP found!");
-	#endif
-//	gcd_debug = (vec_len == 100);
-	if(gcd_debug) {
-		// Used this testcase to convince myself that really do need as many trailing 'guard bits' in the leading-order
-		// arrays sections being fed to the eGCD routine as bits in the resulting abcd multipliers to get a resulting
-		// array-length reduction matching the length of the multipliers:
-		// The abcd-mults resulting from this reduced length from lg(u,v) = 6398 to lg(|a*u-b*v|,|c*u-d*v|) = 3233:
-		fail += (mi64_gcd(x, y, vec_len, eGCD = TRUE, Ap,Bp,&len_AB,&sign_AB, HALF = TRUE,Cp,Dp,&len_CD,&sign_CD,vec_len/2) != 1);
-		// The abcd-mults resulting from this reduced length from log(u,v) = 6398 to log() = 5712, -686 bits vs -3165:
-		fail += (mi64_gcd(x+40, y+40, 60, eGCD = TRUE, Ap,Bp,&len_AB,&sign_AB, HALF = TRUE,Cp,Dp,&len_CD,&sign_CD,50) != 1);
-		exit(0);
-	} else
-		fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 1);
-		fail += (x[0] != 1ull);	/* Put this on a separate line because some compilers will return 1 as the function result of evaluating (a+b), i.e. fail += a+b increments fail by 1 even if a = b = 0. Strange but true... */
-		ASSERT(HERE, fail==0,"GCD self-test #5");
-	}
-
-	/*
-	Test #6: let p = 2-word prime
-	y = p * {some quasi-random N-word prime, stored in y}
-	x = p * {some quasi-random N-word int coprime to y[]}
-	*/
-	printf("\nPerforming GCD self-test #6...\n");
-	for(j = 2; j < 10; j++)	/* Start multipliers with 3-word primes and go up from there */
-	{
-		gcd_debug=0;
-		vec_len = len_mword_prime[j];
-		curr_mword_prime = ptr_mword_prime[j];
-		for(i = 0; i < vec_len; i++)
-		{
-			v[i] = curr_mword_prime[i];
-			u[i] = rng_isaac_rand();
-		}
-		ASSERT(HERE, mi64_pprimeF(v, 3ull, vec_len) == 1, "GCD self-test #6.prp");
-		/* vector*vector MULs to get x and y are here: */
-		mi64_mul_vector(y2, 2, v, vec_len, y, &i      );
-		mi64_mul_vector(y2, 2, u, vec_len, x, &vec_len);
-		vec_len = MAX(i, vec_len);
-		fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 2);
-		fail += (mi64_cmp_eq(x, y2, 2) != TRUE);
-		ASSERT(HERE, fail==0,"GCD self-test #6");
-	}
-
-	/* Test #7: Use various small primes to test out the mi64_twopmodq routine: */
-	printf("\nPerforming GCD self-test #7...\n");
-	for(j = 0; j < 10; j++) {
-		vec_len = len_mword_prime[j];
-		curr_mword_prime = ptr_mword_prime[j];
-		mi64_set_eq(x, curr_mword_prime, vec_len);	/* x = p */
-		mi64_sub_scalar(x, 1ull, y, vec_len);			/* y = p-1 */
-		ASSERT(HERE, mi64_twopmodq(y, vec_len, 0, x, vec_len, 0x0) == 1, "GCD self-test #7: mi64_twopmodq != 1");	/* 2^(p-1) ?= 1 (mod p) */
-	}
-
-	/*
-	Test #8: Multiply the following factors of M-numbers together (where w:= 2^64):
-
-		M(    677):    157590042578912*w^2 + 10558642444782195772*w +   329809049266961143;
-		M(    773):      9118322195022*w^2 +  1933308633079010416*w + 17814616685598394119;
-		M(    971):     70286054459973*w^2 + 17012949627558354271*w +  3547755741880899889;
-		M(    997): 492416983078691417*w^2 +  8040689323464953445*w + 16007877010440112335;
-		M(   1001):        59364131986*w^2 +  9565712986615012496*w + 10050950882119470361;
-
-	The product of the 5 M-numbers having these small factors needs P uint64 slots to store in compact form.
-	Create a nontrivial y-vector by multiplying the F 64-bit words of the factor product by (P-F) random 64-bit ints.
-
-	We also use this section to test the mi64 string I/O functionality.
-	*/
-	printf("\nPerforming GCD self-test #8...\n");
-
-	/*** First multiply together all the M-numbers, storing the result in x[]: ***/
-	/* M677: */
-	p = 677;	lenX = (p>>6)+1;
-	memset(x,0xff,(lenX<<3));	x[lenX-1] = (1ull << (p&63)) - 1;
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, x, lenX, 0)], "627057063764139831929324851379409869378845668175598843037877190478889006888518431438644711527536922839520331484815861906173161536477065546885468336421475511783984145060592245840032548652210559519683510271"), "GCD self-test #8");
-	/* q677: */
-	u[2] =    157590042578912ull; u[1] = 10558642444782195772ull; u[0] =   329809049266961143ull;	lenU = 3;
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, u, lenU, 0)], "53625112691923843508117942311516428173021903300344567"), "GCD self-test #8");
-	/* Test the back-conversion (string -> mi64): */
-	ASSERT(HERE, 0x0 != (out_array = convert_base10_char_mi64("53625112691923843508117942311516428173021903300344567", &i)) && (i == 3), "0");
-	ASSERT(HERE, mi64_cmp_eq(out_array, u, 3), "0");	free((void *)out_array);	out_array = 0x0;
-
-	ASSERT(HERE, mi64_pprimeF(u, 3ull, lenU) == 1, "GCD self-test #8.pr1");
-//	mi64_setlen(u, lenU, lenX);	only needed for legacy mi64_div_binary function
-	mi64_div(x, u, lenX, lenU, w, z);	/* Check divisibility directly - quotient returned in w, remainder in z */
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, w, lenX, 0)], "11693347245097298120823316616845179432964234640294687969094035694006198722115249470397309583454281704888388908348322516169275641497746779501202976102713"), "GCD self-test #8");
-	ASSERT(HERE, mi64_iszero(z, lenU) == 1, "GCD div-test #8.pr1");
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, z, lenU, 0)], "0"), "GCD self-test #8");
-
-	/* ... *= M773: */
-	p = 773;	lenY = (p>>6)+1;
-	memset(y,0xff,(lenY<<3));	y[lenY-1] = (1ull << (p&63)) - 1;
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, y, lenY, 0)], "49680578953622685924767343630800081768220352547734291556449665216833630485964060362588109082516687294415607382308194342597490561411674060526217192801317796454542559232667196977608489140211150234408415974198927000028571099322113851391"), "GCD self-test #8");
-	mi64_set_eq(w, x, lenX);	/* Use copy of X as multiplicand 1 so we can store the result back into X */
-	mi64_mul_vector(w, lenX, y, lenY, x, &lenX);
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, x, lenX, 0)], "31152557964761163897928842168277492768812223183099899891668902917976925560029252973898899702640514119214599026359946447505761700950328538576680016552349084733366485698028733665842440380429912269143850331229298040934224944520428395437344934546472616298890629611082440248854931228400162480344449363195362608681181976242618371232062995112351931255191940463434239006436301912973135868237655715193561884369357401722616533384552395786116136961"), "GCD self-test #8");
-	/* ... *= q773: */
-	v[2] =      9118322195022ull; v[1] =  1933308633079010416ull; v[0] = 17814616685598394119ull; lenV = 3;
-	ASSERT(HERE, mi64_pprimeF(v, 3ull, lenV) == 1, "GCD self-test #8.pr2");
-	mi64_set_eq(z, u, lenU);
-	mi64_mul_vector(z, lenU, v, lenV, u, &lenU);
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, u, lenU, 0)], "166388228042876887900053740282475640168826969539724862936313620741285381200003507633306121185343797304769"), "GCD self-test #8");
-//	mi64_setlen(u, lenU, lenX);	only needed for legacy mi64_div_binary function
-	mi64_div(x, u, lenX, lenU, w, z);	/* Check divisibility directly - quotient returned in w, remainder in z */
-	ASSERT(HERE, mi64_iszero(z, lenU) == 1, "GCD div-test #8.pr2");
-
-	/* ... *= M971: */
-	p = 971;	lenY = (p>>6)+1;
-	memset(y,0xff,(lenY<<3));	y[lenY-1] = (1ull << (p&63)) - 1;
-	mi64_set_eq(w, x, lenX);	/* Use copy of X as multiplicand 1 so we can store the result back into X */
-	mi64_mul_vector(w, lenX, y, lenY, x, &lenX);
-	/* ... *= q971: */
-	v[2] =     70286054459973ull; v[1] = 17012949627558354271ull; v[0] =  3547755741880899889ull; lenV = 3;
-	ASSERT(HERE, mi64_pprimeF(v, 3ull, lenV) == 1, "GCD self-test #8.pr3");
-	mi64_set_eq(z, u, lenU);
-	mi64_mul_vector(z, lenU, v, lenV, u, &lenU);
-//	mi64_setlen(u, lenU, lenX);	only needed for legacy mi64_div_binary function
-	mi64_div(x, u, lenX, lenU, w, z);	/* Check divisibility directly - quotient returned in w, remainder in z */
-	ASSERT(HERE, mi64_iszero(z, lenU) == 1, "GCD div-test #8.pr3");
-
-	/* ... *= M997: */
-	p = 997;	lenY = (p>>6)+1;
-	memset(y,0xff,(lenY<<3));	y[lenY-1] = (1ull << (p&63)) - 1;
-	mi64_set_eq(w, x, lenX);	/* Use copy of X as multiplicand 1 so we can store the result back into X */
-	mi64_mul_vector(w, lenX, y, lenY, x, &lenX);
-	/* ... *= q997: */
-	v[2] = 492416983078691417ull; v[1] =  8040689323464953445ull; v[0] = 16007877010440112335ull; lenV = 3;
-	ASSERT(HERE, mi64_pprimeF(v, 3ull, lenV) == 1, "GCD self-test #8.pr3");
-	mi64_set_eq(z, u, lenU);
-	mi64_mul_vector(z, lenU, v, lenV, u, &lenU);
-//	mi64_setlen(u, lenU, lenX);	only needed for legacy mi64_div_binary function
-	mi64_div(x, u, lenX, lenU, w, z);	/* Check divisibility directly - quotient returned in w, remainder in z */
-	ASSERT(HERE, mi64_iszero(z, lenU) == 1, "GCD div-test #8.pr3");
-
-	/* ... *= M1001: */
-	p =1001;	lenY = (p>>6)+1;
-	memset(y,0xff,(lenY<<3));	y[lenY-1] = (1ull << (p&63)) - 1;
-	mi64_set_eq(w, x, lenX);	/* Use copy of X as multiplicand 1 so we can store the result back into X */
-	mi64_mul_vector(w, lenX, y, lenY, x, &lenX);
-	/* ... *= q1001: */
-	v[2] =        59364131986ull; v[1] =  9565712986615012496ull; v[0] = 10050950882119470361ull; lenV = 3;
-	ASSERT(HERE, mi64_pprimeF(v, 3ull, lenV) == 1, "GCD self-test #8.pr4");
-	mi64_set_eq(z, u, lenU);
-	mi64_mul_vector(z, lenU, v, lenV, u, &lenU);
-//	mi64_setlen(u, lenU, lenX);	only needed for legacy mi64_div_binary function
-	mi64_div(x, u, lenX, lenU, w, z);	/* Check divisibility directly - quotient returned in w, remainder in z */
-	ASSERT(HERE, mi64_iszero(z, lenU) == 1, "GCD div-test #8.pr4");
-
-	/*** Lastly, pad the q-product stored in U] to be the same length as the M-product [stored in X]
-	by multiplying the q-product by as many random 64-bit digits as are needed to do so: ***/
-	lenV = lenX - lenU;
-	ASSERT(HERE, lenV <= MAX_ARRAY_DIM, "GCD self-test #8: lenV > MAX_ARRAY_DIM!");
-	for(i = 0; i < lenV; i++) {
-		v[i] = rng_isaac_rand();
-	}
-	mi64_mul_vector(u, lenU, v, lenV, y, &lenY);
-	ASSERT(HERE, lenY >= (lenX - 1), "GCD self-test #8: mi64_mul_vector(u, lenU, v, lenV, y, &lenV) < (vec_len - 1)");
-	mi64_setlen(y, lenY, lenX);
-	gcd_debug=0;
-
-	/* First use the data to test mi64_div - add a 10-word const, quotient returned in w, remainder in z, rem = the const we added */
-	tmp64 = mi64_add(x, y10, x, 10);	mi64_add_scalar(x+10, tmp64, x+10, lenX-10);
-	mi64_div(x, u, lenX, lenU, w, z);
-	ASSERT(HERE, mi64_cmp_eq(z, y10, 10), "GCD div-test #8.pr5: Remainder check failed");
-	// Subtract the 10-word const back off:
-	tmp64 = mi64_sub(x, y10, x, 10);	mi64_sub_scalar(x+10, tmp64, x+10, lenX-10);
-	// ...And check that quotient*divisor == x:
-	mi64_mul_vector(w,lenX,u,lenU,z,&i);
-	ASSERT(HERE, i == lenX, "GCD div-test #8.pr5: Remultiply length check failed");
-	ASSERT(HERE, mi64_cmp_eq(x, z, lenX), "GCD div-test #8.pr5: Remultiply product check failed");
-
-	/* Get the GCD: */
-	vec_len = mi64_gcd(x, y, lenX, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0);
-
-	/* Note that 1001 is in fact composite (7*11*13) so M1001 has many small factors. Thus, since the
-	random multiplier we used to pad the large-prime-factor product ma well share some of those small multipliers,
-	we take a further GCD with the unpadded factor product:
-	*/
-	lenX = vec_len;
-	mi64_setlen(u, lenU, lenX);
-	vec_len = mi64_gcd(x, u, lenX, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0);
-	ASSERT(HERE, vec_len == 14,"GCD self-test #8");
-	fail = 0;
-	fail += (x[ 0] !=  3593788887684319431ull);
-	fail += (x[ 1] != 10937322091449271811ull);
-	fail += (x[ 2] != 12946497122050840313ull);
-	fail += (x[ 3] !=   647314691296882413ull);
-	fail += (x[ 4] != 16358554045068224992ull);
-	fail += (x[ 5] != 11116777718169420623ull);
-	fail += (x[ 6] !=  1469016928937333456ull);
-	fail += (x[ 7] !=  9391229552579049908ull);
-	fail += (x[ 8] !=  5839179435288346904ull);
-	fail += (x[ 9] !=  8876202416807555851ull);
-	fail += (x[10] !=  5810184814721434260ull);
-	fail += (x[11] !=  4707921738239363748ull);
-	fail += (x[12] != 13066869379195810396ull);
-	fail += (x[13] !=         470338845646ull);
-	ASSERT(HERE, fail==0,"GCD self-test #8");
-	ASSERT(HERE, STREQ(&char_buf[convert_mi64_base10_char(char_buf, x, vec_len, 0)], "13469989009602505896761896298151251254620625588479300721021471299084968313453138000611404859588356783789058816019125967988627321691441338348931654165690484559185870012613932145949653360324772652694002027486449514263873807210595986656151451735452053543049459632327"), "GCD self-test #8");
-
-	/*
-	Test #9: M27691 has the 121-bit factor 1734072082042172647364731231822850071.
-	This M-number needs 433 uint64 slots to store in compact form. Create a nontrivial
-	y-vector by multiplying the two 64-bit words of the factor by the 431 LSWs of y4[].
-	*/
-if(MAX_ARRAY_DIM >= 433)
-{
-	printf("\nPerforming GCD self-test #9...\n");
-
-	p = 27691;	vec_len = (p>>6)+1;
-
-	// First try a gcd of 2^p-1 (huge) with just the small factor:
-	memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-	memset(u,(uint64) 0,(vec_len<<3));
-	/* Store q in u[]: */
-	u[0] =  4235226679561594903ull;
-	u[1] =    94004235929829273ull;
-	ASSERT(HERE, mi64_pprimeF(u, 3ull, 2) == 1, "GCD self-test #9.prp");
-	clock1 = clock();
-	gcd_debug=0;
-	fail += (mi64_gcd(x, u, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 2);
-	fail += (x[0] !=  4235226679561594903ull);
-	fail += (x[1] !=    94004235929829273ull);
-	ASSERT(HERE, fail==0,"GCD self-test #9a");
-
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf	("GCD self-test #9a passed: GCD Time =%s\n",get_time_str(tdiff));
-
-	// Now re-init x and u and do full-length gcd:
-	mi64_set_eq(u, x, vec_len);
-	memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-	// Replace RHS with y431_test[i] here to recreate garbage-repeat-high-words testcase of 5/13/2012
-	for(i = 0; i < 431; i++) {
-		v[i] = y431_test[i];//rng_isaac_rand();
-	//	printf("v[%3u] = %20llu\n",i,v[i]);
-	}
-	/* vector*vector MUL to get y is here: */
-	mi64_mul_vector(u, 2, v, 431, y, &vec_len);
-	ASSERT(HERE, vec_len == 433, "GCD self-test #9: vec_len == 433");
-	clock1 = clock();
-	gcd_debug=0;
-	fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 2);
-	fail += (x[0] !=  4235226679561594903ull);
-	fail += (x[1] !=    94004235929829273ull);
-	ASSERT(HERE, fail==0,"GCD self-test #9b");
-
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf	("GCD self-test #9b passed: GCD Time =%s\n",get_time_str(tdiff));
-}
-
-	/*
-	Test #10: M80239 has the factor q = 39654784768949071.
-	This M-number needs 1254 uint64 slots to store in compact form. Create a nontrivial
-	y-vector by multiplying the factor by a number whose base-2^64 digits consist of
-	1253 randomly selected words of y4[]. Owing to
-	the special form of M-number divisors it's exceedingly unlikely that the resulting y[]
-	will have any factors in common with x[] other than q.
-	*/
-if(MAX_ARRAY_DIM >= 1254)
-{
-	printf("\nPerforming GCD self-test #10...\n");
-
-	p = 80239;	vec_len = (p>>6)+1;
-
-	// First try a gcd of 2^p-1 (huge) with just the small factor:
-	memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-	memset(u,(uint64) 0,(vec_len<<3));	u[0] = 39654784768949071ull;
-	clock1 = clock();
-	gcd_debug=0;
-	fail += (mi64_gcd(x, u, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 1);
-	fail += (x[0] !=    39654784768949071ull);
-	ASSERT(HERE, fail==0,"GCD self-test #10a");
-
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf	("GCD self-test #10a passed: GCD Time =%s\n",get_time_str(tdiff));
-
-	// Now re-init x and u and do full-length gcd:
-	memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-	for(i = 0; i < vec_len-1; i++) {
-		y[i] = rng_isaac_rand();
-	}
-	y[vec_len-1] = mi64_mul_scalar(y, 39654784768949071ull, y, vec_len-1);
-	ASSERT(HERE, y[vec_len-1] != 0, "GCD self-test #10: expected nonzero carryout of q0* y4[0:vec_len-1]!");
-
-	clock1 = clock();
-	gcd_debug=0;
-	fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 1);
-	fail += (x[0] !=    39654784768949071ull);
-	ASSERT(HERE, fail==0,"GCD self-test #10");
-
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf	("GCD self-test #10b passed: GCD Time =%s\n",get_time_str(tdiff));
-}
-
-	/*
-	Test #11: M80239 has the factor q = 39654784768949071.
-	Same as above, but in this case give the mi64_mul_vector routine a workout by
-	multiplying M80239 by a quasirandom 747-word input to get a 2001-word x-input
-	and multiplying q by the product of 2 quasirandom 1000-word inputs to get a 2001-word y.
-	*/
-gcd_debug=0;
-
-if(MAX_ARRAY_DIM >= 1254)
-{
-	printf("\nPerforming GCD self-test #11...\n");
-
-	p = 80239;	vec_len = (p>>6)+1;
-	memset(u,0xff,(vec_len<<3));	u[vec_len-1] = (1ull << (p&63)) - 1;
-	/* Multiply u by random multiword integer to get 2001-word product: */
-	for(i = 0; i < 2001-vec_len; i++) {
-		v[i] = rng_isaac_rand();
-	}
-
-	/* vector*vector MUL to get x is here: */
-	mi64_mul_vector(u, vec_len, v, 2001-vec_len, x, &vec_len);
-	ASSERT(HERE, vec_len == 2001, "GCD self-test #11: vec_len == 2001");
-
-	/* y: */
-	for(i = 0; i < 1000; i++) {
-		u[i] = rng_isaac_rand();
-		v[i] = rng_isaac_rand();
-	}
-
-	/* vector*vector MUL to get y is here: */
-	mi64_mul_vector(u, 1000, v, 1000, y, &vec_len);
-	ASSERT(HERE, vec_len == 2000, "GCD self-test #11: vec_len == 2000");
-	/* ...and in-place-multiply the result by q: */
-	y[vec_len] = mi64_mul_scalar(y, 39654784768949071ull, y, vec_len);
-	++vec_len;
-
-	/* Because of the way we constructed the test vectors they may have other small-prime
-	factors in common, so weed those out by taking a second short-length GCD, if necessary: */
-	clock1 = clock();
-	vec_len = mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0);
-	if(vec_len != 1 || x[0] != 39654784768949071ull) {
-		mi64_clear(y, vec_len);
-		y[0] = 39654784768949071ull;
-		fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 1);
-		fail += (x[0] != 39654784768949071ull);
-	}
-	ASSERT(HERE, fail==0,"GCD self-test #11");
-
-	clock2 = clock();
-	tdiff = (double)(clock2 - clock1);
-	printf	("GCD self-test #11 passed: GCD Time =%s\n",get_time_str(tdiff));
-}
-
-	/*
-	Test #12: M2202817 has the factor 87722769297534671.
-	This M-number needs 34420 uint64 slots to store in compact form. Create a nontrivial
-	y-vector by multiplying the factor by 34419 randomly selected words of y4[]. Owing to
-	the special form of M-number divisors it's exceedingly unlikely that the resulting y[]
-	will have any factors in common with x[] other than q.
-
-	04 May 2012: ~9 sec on a single core of a 2 GHz Core2 Duo.
-	23 Dec 2015: 6.2 sec	 "			"				" using new FFT-enhanced GCD.
-	*/
-#if OS_BITS == 64
-	if(MAX_ARRAY_DIM >= 34420)
-	{
-		printf("\nPerforming GCD self-test #12...\n");
-		p = 2202817;	vec_len = (p>>6)+1;
-		/*
-		2202817 = 1000011001110011000001_2
-		So the LR binary powering sequence is:
-		bit	power	action
-		---	-------	---------
-		1	1		x=2;
-		0	2		x=x^2%q;
-		0	4		x=x^2%q;
-		0	8		x=x^2%q;
-		0	16		x=x^2%q;
-		1	33		x=x^2%q; x=2*x%q;
-		1	67		x=x^2%q; x=2*x%q;
-		0	134		x=x^2%q;
-		0	268		x=x^2%q;
-		1	537		x=x^2%q; x=2*x%q;
-		1	1075	x=x^2%q; x=2*x%q;
-		1	2151	x=x^2%q; x=2*x%q;
-		0	4302	x=x^2%q;
-		0	8604	x=x^2%q;
-		1	17209	x=x^2%q; x=2*x%q;
-		1	34419	x=x^2%q; x=2*x%q;
-		0	68838	x=x^2%q;
-		0	137676	x=x^2%q;
-		0	275352	x=x^2%q;
-		0	550704	x=x^2%q;
-		0	1101408	x=x^2%q;
-		1	2202817	x=x^2%q; x=2*x%q;
-		Which allows the quotient ( = x-1 at the end of the above) to easily be computed using e.g. Pari/GP or bc.
-		*/
-		// First use this M(p) for a large-vector timing test of the mi64_div_by_scalar64 fucntion:
-		memset(u,0xff,(vec_len<<3));	u[vec_len-1] = (1ull << (p&63)) - 1;
-	
-	#if 0	// Set = 1 to do 64-bit is-divisible timing test
-		clock1 = clock();
-		for(i = 0; i < 10000; i++) {
-			ASSERT(HERE, mi64_is_div_by_scalar64_u4(u, 87722769297534671ull, vec_len), "GCD self-test #12.0");
-		}
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #12.0 passed: mi64_is_div_by_scalar64(), %u*%u words processed, Time =%s\n",i,vec_len,get_time_str(tdiff));
-	//	exit(0);
-	#endif
-	#if 0	// Set = 1 to do 64-bit div-with-remainder timing test
-		clock1 = clock();
-		for(i = 0; i < 10000; i++) {
-		//	ASSERT(HERE, mi64_div_by_scalar64   (u, 16357897499336320049ull, vec_len, v) == 14995895313315469881ull, "GCD self-test #12.0");
-		//	ASSERT(HERE, mi64_div_by_scalar64_u2(u, 16357897499336320049ull, vec_len, v) == 14995895313315469881ull, "GCD self-test #12.0");
-			ASSERT(HERE, mi64_div_by_scalar64_u4(u, 16357897499336320049ull, vec_len, v) == 14995895313315469881ull, "GCD self-test #12.0");
-		}
-		clock2 = clock();
-		ASSERT(HERE,!mi64_mul_scalar(v, 16357897499336320049ull, v, vec_len), "GCD self-test #12.1");
-		ASSERT(HERE, (v[0] += 14995895313315469881ull) && mi64_cmp_eq(u, v, vec_len), "GCD self-test #12.1");
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #12.1 passed: mi64_div_by_scalar64(), %u*%u words processed, Time =%s\n",i,vec_len,get_time_str(tdiff));
-		exit(0);
-	#endif
-	
-		// Next try a gcd of 2^p-1 (huge) with just the small factor:
-		memset(v,(uint64) 0,(vec_len<<3));	v[0] = 87722769297534671ull;
-		clock1 = clock();
-		gcd_debug=0;
-		fail += (mi64_gcd(u, v, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 1);
-		fail += (u[0] !=    87722769297534671ull);
-		ASSERT(HERE, fail==0,"GCD self-test #12a");
-	
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #12a passed: GCD Time =%s\n",get_time_str(tdiff));
-	
-		/* Now create a full-length v-vector = [large random padding integer]*[known small factor]: */
-		memset(u,0xff,(vec_len<<3));	u[vec_len-1] = (1ull << (p&63)) - 1;
-		for(i = 0; i < vec_len-1; i++) {
-			v[i] = rng_isaac_rand();
-		}
-		v[vec_len-1] = mi64_mul_scalar(v, 87722769297534671ull, v, vec_len-1);
-		ASSERT(HERE, v[vec_len-1] != 0, "GCD self-test #12: expected nonzero carryout of q0* y4[0:vec_len-1]!");
-
-	//******************** Dec 2015: Use this testcase to prototype the FFT-mul-enhanced GCD: *************************
-		clock1 = clock();
-		gcd_debug=0;
-	// Do the FFT-GCD:
-		fail  = (fft_gcd(u, v, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0) != 1);
-		fail += (u[0] !=    87722769297534671ull);
-		ASSERT(HERE, fail==0,"GCD self-test #12b");
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #12fft passed: FFT-GCD Time =%s\n",get_time_str(tdiff));
-
-	// Pure-int GCD of same inputs:
-		p = 2202817;	vec_len = (p>>6)+1;
-		memset(u,0xff,(vec_len<<3));	u[vec_len-1] = (1ull << (p&63)) - 1;
-		for(i = 0; i < vec_len-1; i++) {
-			v[i] = rng_isaac_rand();
-		}
-		v[vec_len-1] = mi64_mul_scalar(v, 87722769297534671ull, v, vec_len-1);
-		ASSERT(HERE, v[vec_len-1] != 0, "GCD self-test #12: expected nonzero carryout of q0* y4[0:vec_len-1]!");
-		clock1 = clock();
-		gcd_debug=0;
-		fail  = (mi64_gcd(u, v, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 1);
-		fail += (u[0] !=    87722769297534671ull);
-		ASSERT(HERE, fail==0,"GCD self-test #12c");
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #12int passed: INT-GCD Time =%s\n",get_time_str(tdiff));
-
-	}
-
-#endif	// OS_BITS == 64 ?
-
-	/*
-	Test #13: M22030163 has factor 795869615818103511527145535321.
-	This M-number needs 344222 uint64 slots to store in compact form. Create a nontrivial
-	y-vector by multiplying the factor by 344220 random 64-bit ints. Owing to
-	the special form of M-number divisors it's exceedingly unlikely that the resulting y[]
-	will have any factors in common with x[] other than q.
-
-	26 Dec 2015, on a single core of a 2 GHz Core2 Duo: ? sec using pure-int code, ? sec using new FFT-gcd.
-	*/
-#if OS_BITS == 64
-	if(MAX_ARRAY_DIM >= 344222)
-	{
-		printf("\nPerforming GCD self-test #13...\n");
-		p = 22030163;	vec_len = (p>>6)+1;
-		// First use this M(p) for a large-vector timing test of the mi64_div_by_scalar64 fucntion:
-		memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-	
-		// Next try a gcd of 2^p-1 (huge) with just the small factor:
-		memset(y,(uint64) 0,(vec_len<<3));	y[1] = 43144178324ull; y[0] = 4788416424359163737ull;
-		clock1 = clock();
-		gcd_debug=0;
-		fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 2);
-		fail += (x[1] != 43144178324ull || x[0] != 4788416424359163737ull);
-		ASSERT(HERE, fail==0,"GCD self-test #13a");
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #13a passed: GCD Time =%s\n",get_time_str(tdiff));
-	
-		/* Now create a full-length y-vector = [large random padding integer]*[known small factor]: */
-		memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-		y[0] = 0;
-		for(i = 0; i < vec_len-2; i++)
-		{
-			x[i] = y[i+1] = rng_isaac_rand();
-		}
-		x[vec_len-1] = x[vec_len-2] = y[vec_len-1] = 0;
-		// Now y = 2^64 * x:
-		x[vec_len-2] = mi64_mul_scalar(x, 4788416424359163737ull, x, vec_len-2);
-		ASSERT(HERE, x[vec_len-2] != 0, "GCD self-test #13b: expected nonzero carryout of q0* y4[0:vec_len-2]!");
-		y[vec_len-1] = mi64_mul_scalar_add_vec2(y, 43144178324ull, x, y, vec_len-1);
-		ASSERT(HERE, y[vec_len-1] != 0, "GCD self-test #13b: expected nonzero carryout of q1* y4[0:vec_len-1]!");
-		// Check that the 2-word factor actually divides y:
-		x[1] = 43144178324ull; x[0] = 4788416424359163737ull;
-		ASSERT(HERE, mi64_div(y, x, vec_len, 2, 0x0, 0x0), "GCD self-test #13b: divisibility test fails!");
-		// ...And re-init x to hold 2^p-1:
-		memset(x,0xff,(vec_len<<3));	x[vec_len-1] = (1ull << (p&63)) - 1;
-
-	// Do the FFT-GCD:
-		memcpy(u,x,(vec_len<<3));	memcpy(v,y,(vec_len<<3));
-		fail  = (fft_gcd(u, v, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0) != 1);
-		fail += (x[1] != 43144178324ull || x[0] != 4788416424359163737ull);
-		ASSERT(HERE, fail==0,"GCD self-test #12b");
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #13fft passed: FFT-GCD Time =%s\n",get_time_str(tdiff));
-
-	// Pure-int GCD of same inputs:
-		clock1 = clock();
-		gcd_debug=0;
-		fail += (mi64_gcd(x, y, vec_len, eGCD = FALSE, 0x0,0x0,0x0,0x0, 0,0x0,0x0,0x0,0x0,0) != 2);
-		fail += (x[1] != 43144178324ull || x[0] != 4788416424359163737ull);
-		ASSERT(HERE, fail==0,"GCD self-test #13b");
-		clock2 = clock();
-		tdiff = (double)(clock2 - clock1);
-		printf	("GCD self-test #13int passed: GCD Time =%s\n",get_time_str(tdiff));
-	}
-exit(0);
-#endif	// OS_BITS == 64 ?
-
-#if 1	/****************************************************************************************/
-	/*
-	Test #14: test the FFT-mul routines with vectors of various sizes,
-	starting with 4K 16-bit reals (= 2K 16-bit complex = 512 64-bit complex):
-	*/
-	vec_len = 512;
-	ASSERT(HERE, MAX_ARRAY_DIM >= 2*vec_len, "GCD self-test #14: MAX_ARRAY_DIM too small!");
-	ASSERT(HERE, !ARRAYS_OVERLAP(u,vec_len,v,vec_len), "GCD self-test #14: u/v-arrays overlap!");
-	ASSERT(HERE, !ARRAYS_OVERLAP(x,vec_len,y,vec_len), "GCD self-test #14: x/y-arrays overlap!");
-	while(vec_len <= MAX_ARRAY_DIM)
-	{
-		/* Init an array of (vec_len) random uint64's: */
-		mi64_rand(x,vec_len);
-		mi64_rand(y,vec_len);
-		mi64_set_eq(u,x,vec_len);
-		mi64_set_eq(v,y,vec_len);
-	printf("GCD self-test #14: vec_len = %u: x^2.lo64 = %20llu, y^2.lo64 = %20llu\n",vec_len,x[0]*x[0],y[0]*y[0]);
-		mi64_mul_vector(u, vec_len, u, vec_len, w, &lenZ);
-		ASSERT(HERE, lenZ == 2*vec_len, "GCD self-test #14: lenZ != 2*vec_len");
-		mi64_mul_vector(v, vec_len, v, vec_len, z, &lenZ);
-		ASSERT(HERE, lenZ == 2*vec_len, "GCD self-test #14: lenZ != 2*vec_len");
-	/* This was for testing repeated auto-square within an iterative loop:
-		for(i = 0; i < vec_len; i++) {
-			x[i] = y[i] = 0ull;
-		}
-		x[0] = 3; y[0] = 7;
-	*/
-		/* Init A: Convert x+I*y to packed-double form, store outputs in A: */
-		fft_len = mi64_cvt_uint64_double(x,y, 0,vec_len, a);
-		// For now just use 16 bits per double:
-		ASSERT(HERE, fft_len == 4*vec_len, "GCD self-test #14: unexpected return value of mi64_cvt_uint64_double!");
-		fft_len *= 2;	/* Multiply by 2 here because mi64_cvt_uint64_double returns *complex* FFT length
-						that results from odd/even interleaving and int64-to-real conversion of the 2 input arrays
-						*/
-		pad_len = fft_len + ( (fft_len >> DAT_BITS) << PAD_BITS );	/* Padded-array length */
-		ASSERT(HERE, !ARRAYS_OVERLAP(a,2*pad_len,b,2*pad_len), "GCD self-test #14: a/b-arrays overlap!");
-		/* Zero-pad the input vector in preparation for FFT-based multiply: */
-		memset(a+pad_len, 0, (pad_len<<3));
-	/*
-		printf("FFTmul inputs:\n");
-		for(i = 0; i < 2*fft_len; i+=2)
-		{	*** need padded-index here! ***
-			printf("[%3u]: %20.5f %20.5f\n",i>>1,a[i],a[i+1]);
-		}
-	*/
-		/* Call the init-FFT routines, using the (as yet uninited) b-array for the needed scratch space: */
-		pairFFT_mul(  b,a,0, 2*fft_len, TRUE , FALSE);	/* INIT_ARRAYS = TRUE , FORWARD_FFT_ONLY = FALSE */
-		/* Save a copy of the input vector: */
-		memcpy(b,a,(pad_len<<4));	// double the length of memcpy, thus no 2nd memset(0) step needed to 0-pad b.
-		/* Forward FFT of A-vector: */
-		pairFFT_mul(  a,0,0, 2*fft_len, FALSE, TRUE );	/* INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = TRUE  */
-	/*
-		printf("FFTmul outputs:\n");
-		for(i = 0; i < 2*fft_len; i+=2)
-		{	*** need padded-index here! ***
-			printf("[%3u]: %20.5f %20.5f\n",i>>1,a[i],a[i+1]);
-		}
-	*/
-		/* Now do the FFT-based squaring: */
-		pairFFT_mul(  b,a,0, 2*fft_len, FALSE, FALSE);	/* INIT_ARRAYS = FALSE, FORWARD_FFT_ONLY = FALSE */
-
-		/* Convert result back to uint64[] form and compare 2 outputs to all-integer squarings.
-
-			NOTE: u,v,x,y each only alloc'ed to MAX_ARRAY_DIM elts, thus not large enough to store
-		a full double-width product. But since we only need two pointers for the back-converted FFT outputs,
-		we use the 2*MAX_ARRAY_DIM-long alloc for u+v (accessed via u-ptr) for Re part of the FFT output,
-		and similarly use x+y (accessed via x-ptr) for the Im part of the FFT output:
-		*/
-		ASSERT(HERE, mi64_cvt_double_uint64(b, fft_len, u,x) == 0x0, "GCD self-test #14: nonzero carryout from mi64_cvt_double_uint64!");
-
-	//	ASSERT(HERE, mi64_cmp_eq(u,w, 2*vec_len), "GCD self-test #14: Re part of the FFT output mismatches reference!");
-		for(i = 0; i < 2*vec_len; i++) {
-			ASSERT(HERE, u[i] == w[i], "GCD self-test #14: Re part of the FFT output mismatches reference!");
-		}
-
-	//	ASSERT(HERE, mi64_cmp_eq(x,z, 2*vec_len), "GCD self-test #14: Im part of the FFT output mismatches reference!");
-		for(i = 0; i < 2*vec_len; i++) {
-			ASSERT(HERE, x[i] == z[i], "GCD self-test #14: Im part of the FFT output mismatches reference!");
-		}
-
-		vec_len *= 2;
-
-		if(vec_len > MAX_ARRAY_DIM)
-			break;
-	}
-
-	/* Now turn the pairwise-linear-combination version of the genFFT_mul algo loose on some actual GCD vector pairs.
-	Here is the basic idea:
-
-		Using the old quadratic 64-bit-int version of the code (as captured in the mi64_gcd function), we compute
-	GCD(u,v) by reducing both vectors u,v ~64 bits per iteration. To do this we use the leading 128 bits of u,v to
-	construct 2 pairs of 64-bit multipliers [a,b] and [c,d] such that u' = a*u - b*v and v' = c*u - d*v are just
-	64 bits each. Each of the 4 subproducts a*u, b*v, c*u and d*v is 192 bits, thus the multipliers are chosen such
-	that the leading 128 bits of the pairs [a*u, b*v] and [c*u, d*v] match and cancel upon subtraction.
-
-	Example: base = 2^64, 128-bit inputs:
-		u = base*15243067522374418434 +  8684254506156651683;
-		v = base* 2503926509447010181 + 14921004030550884877;
-
-		a =    29965985744110959; b =   182422903527176837;
-		c =   900835843694674153; d =  5483987465369195218;
-
-a*u = base^2* 24761743440837882 + base*18356059860786658291 + 4126082876808790445
-b*v = base^2* 24761743440837882 + base*18356059860786658288 + 8228777268749009601
-a*u-b*v = 51237537829188435692, 65 bits.
-
-c*u = base^2* 744386192877435340 + base*12608466216960890642 + 10700841904888342619
-d*v = base^2* 744386192877435340 + base*12608466216960890642 + 11791441100596549802
-c*u-d*v = -1090599195708207183, 64-bit.
-
-But note that if we use only the leading 64-bit word of u,v in the 2 linear combinations, we get
-
-a*u.hi64 = base^2*  24761743440837882 + base*18341952642423900894
-b*v.hi64 = base^2*  24761743440837882 + base*18208503583759260185
-
-c*u.hi64 = base^2* 744386192877435340 + base*12184375781074026962
-d*v.hi64 = base^2* 744386192877435340 + base* 8172637392790005018 ,
-
-and only the leading ~64 bits of the resulting subproducts cancel upon subtraction, leaving 64 bits
-plus the unprocessed low words. This would indicate that we must apply the N-bit [a,b,c,d] multipliers
-to the same leading 2N bits of the inputs u,v which were used to compute [a,b,c,d], but note what we
-get by applying the same multipliers to low halves of u,v:
-
-a*u.lo64 =                              base*   14107218362757397 +  4126082876808790445
-b*v.lo64 =                              base*  147556277027398103 +  8228777268749009601
-
-c*u.lo64 =                              base*  424090435886863680 + 10700841904888342619
-d*v.lo64 =                              base* 4435828824170885624 + 11791441100596549802 .
-
-Now add the base* coefficients in the hi64-products to those in the lo64-products:
-
-18341952642423900894 +   14107218362757397 = 18356059860786658291	<*** low 2 bits
-18208503583759260185 +  147556277027398103 = 18356059860786658288	<*** mismatch!
-
-12184375781074026962 +  424090435886863680 = 12608466216960890642
- 8172637392790005018 + 4435828824170885624 = 12608466216960890642 ,
-
-and we see that except for the low 2 bits of the first case, the results match. (And if we had simply
-included a few 'guard bits' into our computation, we could guarantee with a high probability that all
-the results would match perfectly.) Thus, with appropriate bitcounts in our [a,b,c,d] multipliers and
-the chunks of u,v to which they are applied, we *know* that the high 2N bits of the full length-3N
-[a,b,c,d]*[lead 2N bits of u,v] linear combos will match, thus we only need to explicitly compute the
-low N bits, via a single length-2N FFT-mul to get the length-2N linear combos [a*u.loN - b*v.loN] and
-[c*u.loN - d*v.loN] and discarding of the high N-bit halves of the results.
-
-x=d*v; hi = x>>64
-lo = x-(hi<<64)
-******* See if can actually get low-halves-only-FFT-mul scheme working, then modify appropriately: ********
-For large u,v, inputs in order to take advantage of fast FFT-mul, we run mi64_gcd in eGCD mode to compute
-(say) 8192-bit [a,b,c,d] multipliers at a time, using the leading 16384 bits of u,v. We then use FFT-mul
-to compute our linear combos, in 2 stages:
-
-[1] FFT-mul to compute [a*u.lo8192 - b*v.lo8192] and [c*u.lo8192 - d*v.lo8192], each 16384 bits,
-where lo8192 refers to the lower halves of the leading 16384-bits of u,v used to compute the multipliers.
-
-[2] FFT-mul to compute [a*u.hi8192 - b*v.hi8192] and [c*u.hi8192 - d*v.hi8192], each 16384 bits,
-where hi8192 refer to the upper halves of the leading 16384-bits of u,v used to compute the multipliers.
-We then add the upper 8192 bits of the 2 lincombos computed in [1] to the low 8192 bits of the high-part
-lincombos and propagate carries, at which point we should find all but perhaps the lowest few of the high
-16384 bits of the full length-3*N = 3*8192 = 24576-bit lincombos cancelling to 0.
-
-Cost of the FFT-muls in [1],[2]:
-
-[A] Cost of generating the 8192 mults [a,b,c,d] using pure-int eGCD and 64-bit-at-a-time reduction: O((N/64)^2);
-[B] Length-2*N complex FFT to compute length-2*N fFFT(u.lo8192, v.lo8192);
-[C] Length-2*N complex FFT to compute length-2*N fFFT(u.hi8192, v.hi8192);
-[D] Length-2*N complex FFT to compute length-2*N fFFT(a,b);
-[E] Length-2*N complex FFT to compute length-2*N fFFT(c,d);
-[F] O(N) dyad-mul and Length-2*N complex iFFT to compute length-2*N lincombos in [1];
-[G] O(N) dyad-mul and Length-2*N complex iFFT to compute length-2*N lincombos in [2];
-[H} O(N) add/carry step to combine the 2 length-2*N vector pairs (via overlap of middle N bits).
-
-Thus asymptotically the cost is that of 6 length-2*N complex FFTs.
-	*/
-// FFTMUL_THRESHOLD_BITS = #leading bits of u,v to use to generate FFT-multipliers.
-// This corresponds to 2N in above commentary, thus said multipliers are half as long.
-
-#endif	/****************************************************************************************/
-
-	return fail;
-}
-
-#undef YES_ASM