diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
index f165597e..3f01875e 100644
--- a/.github/workflows/continuous-integration.yml
+++ b/.github/workflows/continuous-integration.yml
@@ -21,11 +21,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        container: ["alicevision/popsift-deps:cuda10.2-ubuntu18.04", "alicevision/popsift-deps:cuda11.8.0-ubuntu20.04", "alicevision/popsift-deps:cuda12.1.0-ubuntu22.04"]
+        container: ["alicevision/popsift-deps:cuda11.8.0-ubuntu20.04", "alicevision/popsift-deps:cuda12.1.0-ubuntu22.04"]
         build_tpe: ["Release", "Debug"]
         exclude:
         # excludes debug on this one as it has a segmentation fault during the compilation (!)
-        - container: "alicevision/popsift-deps:cuda12.1.0-ubuntu22.04"
+        - container: "alicevision/popsift-deps:cuda11.8.0-ubuntu20.04"
           build_tpe: "Debug"
 
     container:
diff --git a/.travis.yml b/.travis.yml
index dd24a21b..030cb775 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: xenial
+dist: jammy
 
 language: cpp
 compiler: gcc
@@ -13,10 +13,8 @@ addons:
 
 env:
   matrix:
-    - CUDA_VERSION_MAJOR="8" CUDA_VERSION_MINOR="0" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.61-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
-    - CUDA_VERSION_MAJOR="9" CUDA_VERSION_MINOR="2" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.148-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
-    - CUDA_VERSION_MAJOR="10" CUDA_VERSION_MINOR="2" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.89-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
-    - CUDA_VERSION_MAJOR="11" CUDA_VERSION_MINOR="0" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.2-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
+    - CUDA_VERSION_MAJOR="11" CUDA_VERSION_MINOR="8" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.0-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
+    - CUDA_VERSION_MAJOR="12" CUDA_VERSION_MINOR="5" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.1-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
 
 
   global:
@@ -34,7 +32,7 @@ env:
     - POPSIFT_APP_INSTALL_RELEASE=${POPSIFT_APP_BUILD_RELEASE}/install
     - POPSIFT_APP_INSTALL_DEBUG=${POPSIFT_APP_BUILD_DEBUG}/install
     # CMAKE
-    - CMAKE_URL="https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.tar.gz"
+    - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.tar.gz"
     - CMAKE_ROOT=${TRAVIS_BUILD_DIR}/cmake
     - CMAKE_SOURCE=${CMAKE_ROOT}/source
     - CMAKE_INSTALL=${CMAKE_ROOT}/install
@@ -57,30 +55,12 @@ before_install:
     fi
 
 install:
-  - UBUNTU_VERSION=ubuntu1604
-  - >
-    if [ ${CUDA_VERSION_MAJOR} -lt 11 ]; then
-        CUDA_REPO_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA_PKG_LONGVERSION}_amd64.deb
-        wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/$CUDA_REPO_PKG
-        travis_retry sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
-        sudo dpkg -i $CUDA_REPO_PKG
-        rm ${CUDA_REPO_PKG}
-        travis_retry sudo apt-get -y update
-        # cuda > 10.0 changed cublas naming
-        if [ ${CUDA_VERSION_MAJOR} -lt 10 ]; then
-              CUBLAS_PKG=cuda-cublas-dev-$CUDA_PKG_VERSION
-        else
-              CUBLAS_PKG=libcublas-dev
-        fi
-        travis_retry sudo apt-get install -y --no-install-recommends --allow-unauthenticated cuda-core-$CUDA_PKG_VERSION  cuda-cudart-dev-$CUDA_PKG_VERSION  ${CUBLAS_PKG} cuda-curand-dev-$CUDA_PKG_VERSION
-        sudo ln -s /usr/local/cuda-${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} /usr/local/cuda
-    else
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-ubuntu1604.pin
-        travis_retry sudo mv cuda-ubuntu1604.pin /etc/apt/preferences.d/cuda-repository-pin-600
-        travis_retry sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
-        travis_retry sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/ /"
-        sudo apt-get update && sudo apt-get -y install cuda
-    fi
+  - UBUNTU_VERSION=ubuntu2204  # NVIDIA repository name matching "dist: jammy"
+  - travis_retry wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/cuda-${UBUNTU_VERSION}.pin
+  - sudo mv cuda-${UBUNTU_VERSION}.pin /etc/apt/preferences.d/cuda-repository-pin-600
+  - travis_retry sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/3bf863cc.pub
+  - travis_retry sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/ /"
+  - sudo apt-get update && sudo apt-get -y install cuda-toolkit-${CUDA_PKG_VERSION}  # toolkit of this matrix entry, not the newest "cuda" metapackage
 #  - CUDA_REPO_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA_PKG_LONGVERSION}_amd64.deb
 #  - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/$CUDA_REPO_PKG
 
@@ -131,3 +111,4 @@ cache:
   apt: true
   directories:
     - ${CMAKE_INSTALL}
+
diff --git a/CHANGES.md b/CHANGES.md
index 9608b25d..1cfc0b51 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [1.0.0] - YYYY-MM-DD
 
+### Changed
+
+- CMake: CUDA is now built as a first-class CMake language; compute capabilities are selected through CMAKE_CUDA_ARCHITECTURES
+
 ### Added
 - Improved checks for CUDA textures [PR](https://github.com/alicevision/popsift/pull/89)
 - CMake: Improved support for all Cuda CC [PR](https://github.com/alicevision/popsift/pull/75)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f39f2fec..a9c033cb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,16 @@
-# CMake below 3.4 does not work with CUDA separable compilation at all
-cmake_minimum_required(VERSION 3.12)
+# CUDA as a first-class language requires CMake 3.18; CMAKE_CUDA_ARCHITECTURES=native requires 3.24
+cmake_minimum_required(VERSION 3.24)
 
-project(PopSift VERSION 1.0.0 LANGUAGES CXX)
+# Default set of CUDA CCs. It is seeded before project() enables the CUDA language,
+# because CMake otherwise initializes the cache entry with a compiler-specific default.
+# No FORCE: a value passed as -DCMAKE_CUDA_ARCHITECTURES=... must not be overwritten.
+set(CMAKE_CUDA_ARCHITECTURES "all-major"
+    CACHE
+    STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+
+# Policy CMP0104 (CUDA_ARCHITECTURES handling) is already NEW through cmake_minimum_required above.
+# See https://cmake.org/cmake/help/latest/policy/CMP0104.html
+project(PopSift VERSION 1.0.0 LANGUAGES CXX CUDA)
 
 # Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
@@ -16,7 +25,6 @@ option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce co
 option(PopSift_USE_NORMF "The __normf function computes Euclidean distance on large arrays. Fast but stability is uncertain." OFF)
 option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF)
 option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
-option(PopSift_NO_DEPRECATED_CUDA_SM_WARNINGS "Suppress warnings about soon to be deprecated cuda SM" ON)
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 
 if(PopSift_USE_POSITION_INDEPENDENT_CODE AND NOT MSVC)
@@ -55,17 +63,15 @@ include(GNUInstallDirs)
 
 if(BUILD_SHARED_LIBS)
   message(STATUS "BUILD_SHARED_LIBS ON")
-  # Need to declare CUDA_USE_STATIC_CUDA_RUNTIME as an option to ensure that it is not overwritten in FindCUDA.
-  option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" OFF)
-  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
-  # Workaround to force deactivation of cuda static runtime for cmake < 3.10
-  set(CUDA_cudart_static_LIBRARY 0)
+
   # Auto-build dll exports on Windows
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
 else()
   message(STATUS "BUILD_SHARED_LIBS OFF")
-  option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" ON)
-  set(CUDA_USE_STATIC_CUDA_RUNTIME ON)
+
+  set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
 endif()
 
 # Require threads because of std::thread.
@@ -74,26 +80,15 @@ find_package(Threads REQUIRED)
 ###################
 #  CUDA
 ###################
-find_package(CUDA 7.0 REQUIRED)
+include(CheckLanguage)
+check_language(CUDA)
 
-if(NOT CUDA_FOUND)
-  message(FATAL_ERROR "Could not find CUDA >= 7.0")
-endif()
+# Use "cmake -DCUDAToolkit_ROOT=/some/path" if necessary. REQUIRED because popsift links CUDA::cudart.
+find_package(CUDAToolkit REQUIRED)
+set(CUDA_VERSION_MAJOR ${CUDAToolkit_VERSION_MAJOR}) # formerly set by FindCUDA, still read when choosing the C++ standard
 
-message(STATUS "CUDA Version is ${CUDA_VERSION}")
-
-include(ChooseCudaCC)
-if(NOT DEFINED PopSift_CUDA_CC_LIST)
-  chooseCudaCC(PopSift_CUDA_CC_LIST_BASIC
-               PopSift_CUDA_GENCODE_FLAGS
-               MIN_CC 30
-               MIN_CUDA_VERSION 7.0)
-  set(PopSift_CUDA_CC_LIST ${PopSift_CUDA_CC_LIST_BASIC} CACHE STRING "CUDA CC versions to compile")
-else()
-  getFlagsForCudaCCList(PopSift_CUDA_CC_LIST
-                        PopSift_CUDA_GENCODE_FLAGS)
-endif()
-list(APPEND CUDA_NVCC_FLAGS "${PopSift_CUDA_GENCODE_FLAGS}")
+message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
+set(CUDA_VERSION ${CUDAToolkit_VERSION})
 
 if(PopSift_USE_NVTX_PROFILING)
   message(STATUS "PROFILING CPU CODE: NVTX is in use")
@@ -104,46 +99,8 @@ if(PopSift_ERRCHK_AFTER_KERNEL)
   list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
 endif()
 
-set(CUDA_SEPARABLE_COMPILATION ON)
-
-if(PopSift_NO_DEPRECATED_CUDA_SM_WARNINGS)
-  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-endif()
-
-if(UNIX AND NOT APPLE)
-  list(APPEND CUDA_NVCC_FLAGS         "-Xcompiler;-rdynamic")
-  # set(CUDA_NVCC_FLAGS       "${CUDA_NVCC_FLAGS};-Xptxas;-v")
-  # set(CUDA_NVCC_FLAGS       "${CUDA_NVCC_FLAGS};-Xptxas;-warn-double-usage")
-  list(APPEND CUDA_NVCC_FLAGS_DEBUG   "--keep")
-  list(APPEND CUDA_NVCC_FLAGS_DEBUG   "--source-in-ptx")
-endif()
-
-# The following if should not be necessary, but apparently there is a bug in FindCUDA.cmake that
-# generate an empty string in the nvcc command line causing the compilation to fail.
-# see https://gitlab.kitware.com/cmake/cmake/issues/16411
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  message(STATUS "Building in debug mode")
-  list(APPEND CUDA_NVCC_FLAGS_DEBUG   "-G")
-endif()
-list(APPEND CUDA_NVCC_FLAGS_RELEASE "-O3")
-
-if(PopSift_USE_POSITION_INDEPENDENT_CODE AND NOT MSVC)
-  list(APPEND CUDA_NVCC_FLAGS "-Xcompiler;-fPIC")
-endif()
-
-# this is to ensure that on MSVC the flags for the linker are properly propagate even to the intermediate
-# linking step. This seems not the case e.g. on vcpkg using ninja build.
-if(MSVC)
-  if(BUILD_SHARED_LIBS)
-    set(PopSift_MVSC_LINKER "/MD")
-  else()
-    set(PopSift_MVSC_LINKER "/MT")
-  endif()
-  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-      set(PopSift_MVSC_LINKER "${PopSift_MVSC_LINKER}d")
-  endif()
-  list(APPEND CUDA_NVCC_FLAGS   -Xcompiler ${PopSift_MVSC_LINKER})
-endif()
+# Global default; src/CMakeLists.txt also sets CUDA_SEPARABLE_COMPILATION on popsift, so this may not be required any more.
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
 
 # default stream per-thread implies that each host thread has one non-synchronizing 0-stream
 # currently, the code requires legacy mode
@@ -159,7 +116,7 @@ if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
   endif()
 endif()
 
-set(PopSift_CXX_STANDARD 14) # Thrust/CUB requires C++14 starting with CUDA SDK 11
+set(PopSift_CXX_STANDARD 17) # Default to C++17; Thrust/CUB require at least C++14 starting with CUDA SDK 11
 if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
   set(PopSift_CXX_STANDARD 11)
 endif()
@@ -181,9 +138,9 @@ else()
 endif()
 
 if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
-  set(HAVE_SHFL_DOWN_SYNC   1)
+  set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
 else()
-  set(HAVE_SHFL_DOWN_SYNC   0)
+  set(PopSift_HAVE_SHFL_DOWN_SYNC   0)
 endif()
 
 if(NOT PopSift_USE_GRID_FILTER)
@@ -193,12 +150,8 @@ else()
   set(DISABLE_GRID_FILTER   0)
 endif()
 
-# library required for CUDA dynamic parallelism, forgotten by CMake 3.4
-cuda_find_library_local_first(CUDA_CUDADEVRT_LIBRARY cudadevrt "\"cudadevrt\" library")
-
 if(PopSift_USE_NVTX_PROFILING)
   # library required for NVTX profiling of the CPU
-  cuda_find_library_local_first(CUDA_NVTX_LIBRARY nvToolsExt "NVTX library")
   set(PopSift_USE_NVTX 1)
 else()
   set(PopSift_USE_NVTX 0)
@@ -245,9 +198,14 @@ message(STATUS "Use CUDA NVTX for profiling: " ${PopSift_USE_NVTX_PROFILING})
 message(STATUS "Synchronize and check CUDA error after every kernel: " ${PopSift_ERRCHK_AFTER_KERNEL})
 message(STATUS "Grid filtering: " ${PopSift_USE_GRID_FILTER})
 message(STATUS "Additional warning for CUDA nvcc: " ${PopSift_NVCC_WARNINGS})
-message(STATUS "Compiling for CUDA CCs: ${PopSift_CUDA_CC_LIST}")
 message(STATUS "Install path: " ${CMAKE_INSTALL_PREFIX})
 message(STATUS "Testing step: " ${PopSift_USE_TEST_CMD})
+
+message(STATUS "CMAKE_CUDA_COMPILER = ${CMAKE_CUDA_COMPILER}")
+message(STATUS "CMAKE_CUDA_COMPILER_ID = ${CMAKE_CUDA_COMPILER_ID}")
+message(STATUS "CMAKE_CUDA_COMPILER_VERSION = ${CMAKE_CUDA_COMPILER_VERSION}")
+message(STATUS "CMAKE_CUDA_ARCHITECTURES = ${CMAKE_CUDA_ARCHITECTURES}")
+
 if(PopSift_USE_TEST_CMD)
   message(STATUS "Path for test input: " ${PopSift_TESTFILE_PATH})
 endif()
diff --git a/README.md b/README.md
index 418d1278..738794c9 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ PopSift depends on:
 
 Optionally, for the provided applications:
 
-* Boost >= 1.55 (required components {atomic, chrono, date-time, system, thread}-dev)
+* Boost >= 1.71 (required components {atomic, chrono, date-time, system, thread}-dev)
 
 * DevIL (libdevil-dev) can be used to load a broader range of image formats, otherwise only pgm is supported.
 
diff --git a/appveyor.yml b/appveyor.yml
index dc822f2b..679ac5b5 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,6 +1,9 @@
+#
+# Build system for the PopSift library, including its demo programs.
+#
 version: '1.0.{build}'
 
-image: Visual Studio 2015
+image: Visual Studio 2022
 
 platform:
   - x64
@@ -14,6 +17,9 @@ configuration:
 #    - DBUILD_SHARED_LIBS: 0
 #    - DBUILD_SHARED_LIBS: 1
 
+#
+# Check the separate file cudaInstallAppveyor for the installation of CUDA
+#
 install:
     - cmd: >-
           call cudaInstallAppveyor.cmd
@@ -23,10 +29,22 @@ install:
           --triplet %PLATFORM%-windows
           # devil
 
+#
+# When updating to a new version of Visual Studio, change the generator string after
+# -G and find the suitable toolset version that is listed after -T (v143 in this case).
+# The CUDA Toolkit and the VS version must match. The matches are found in the CUDA
+# documentation.
+# The platform in this case is x64. Apparently, you need it after -T for VS and after -A
+# for CUDA.
+# You can only have one -T parameter, but you can separate several options with a comma.
+#
+# PopSift_USE_GRID_FILTER is off in this build because the installation of CUDA Thrust
+# in cudaInstallAppveyor is not happening yet.
+#
 before_build:
     - md build
     - cd build
-    - cmake -G "Visual Studio 14 2015" -A x64 -T v140,host=x64 -DBUILD_SHARED_LIBS=%DBUILD_SHARED_LIBS% -DPopSift_BUILD_DOCS:BOOL=OFF -DPopSift_USE_POSITION_INDEPENDENT_CODE:BOOL=%DBUILD_SHARED_LIBS% -DPopSift_BUILD_EXAMPLES:BOOL=ON -DCMAKE_BUILD_TYPE=%configuration% -DCMAKE_TOOLCHAIN_FILE=c:/tools/vcpkg/scripts/buildsystems/vcpkg.cmake ..
+    - cmake -G "Visual Studio 17 2022" -A x64 -T v143,host=x64,cuda="%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" -DBUILD_SHARED_LIBS:BOOL=ON -DPopSift_USE_NVTX_PROFILING:BOOL=OFF -DPopSift_USE_GRID_FILTER:BOOL=OFF -DPopSift_BUILD_DOCS:BOOL=OFF -DPopSift_USE_POSITION_INDEPENDENT_CODE:BOOL=ON -DPopSift_BUILD_EXAMPLES:BOOL=ON -DCMAKE_BUILD_TYPE=%configuration% -DCMAKE_TOOLCHAIN_FILE=c:/tools/vcpkg/scripts/buildsystems/vcpkg.cmake ..
     - ls -l
 
 build:
diff --git a/cmake/ChooseCudaCC.cmake b/cmake/ChooseCudaCC.cmake
deleted file mode 100755
index aba4eb91..00000000
--- a/cmake/ChooseCudaCC.cmake
+++ /dev/null
@@ -1,184 +0,0 @@
-#
-# This file contains two functions:
-# chooseCudaCC
-# getFlagsForCudaCCList
-#
-# Motivation:
-# CUDA hardware and SDKs are developing over time, different SDK support different
-# hardware, and supported hardware differs depending on platform even for the same
-# SDK version. This file attempts to provide a function that returns a valid selection
-# of hardware for the current SDK and platform. It will require updates as CUDA develops,
-# and it is currently not complete in terms of existing platforms that support CUDA.
-#
-
-#
-# Return the minimal set of supported Cuda CC 
-#
-# Usage:
-#   chooseCudaCC(SUPPORTED_CC SUPPORTED_GENCODE_FLAGS
-#                [MIN_CUDA_VERSION X.Y]
-#                [MIN_CC XX ])
-#
-# SUPPORTED_CC out variable. Stores the list of supported CC.
-# SUPPORTED_GENCODE_FLAGS out variable. List of gencode flags to append to, e.g., CUDA_NVCC_FLAGS
-# MIN_CUDA_VERSION the minimal supported version of cuda (e.g. 7.5, default 7.0).
-# MIN_CC minimal supported Cuda CC by the project (e.g. 35, default 20)
-#
-# This function does not edit cache entries or variables in the parent scope
-# except for the variables whose names are supplied for SUPPORTED_CC and
-# SUPPORTED_GENCODE_FLAGS
-#
-# You may want to cache SUPPORTED_CC and append SUPPORTED_GENCODE_FLAGS to
-# CUDA_NVCC_FLAGS.
-# Like this:
-#    set(MYCC ${MYCC} CACHE STRING "CUDA CC versions to compile")
-# end
-#    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${MY_GENCODE_FLAGS}")
-#    
-function(chooseCudaCC SUPPORTED_CC SUPPORTED_GENCODE_FLAGS)
-  set(options "")
-  set(oneValueArgs MIN_CUDA_VERSION MIN_CC)
-  set(multipleValueArgs "")
-  cmake_parse_arguments(CHOOSE_CUDA "${options}" "${oneValueArgs}" "${multipleValueArgs}" ${ARGN})
-
-  if(NOT DEFINED CHOOSE_CUDA_MIN_CC)
-    set(CHOOSE_CUDA_MIN_CC 20)
-  endif()
-  if(NOT DEFINED CHOOSE_CUDA_MIN_CUDA_VERSION)
-    set(CHOOSE_CUDA_MIN_CUDA_VERSION 7.0)
-  endif()
-
-  find_package(CUDA ${CHOOSE_CUDA_MIN_CUDA_VERSION} REQUIRED)
-
-  if(NOT CUDA_FOUND)
-    message(FATAL_ERROR "Could not find CUDA >= ${CHOOSE_CUDA_MIN_CUDA_VERSION}")
-  endif()
-
-  #
-  # Create a list of possible CCs for each host processor.
-  # This may require tuning: CUDA cards exist in AIX machines with POWER CPUs,
-  # it is possible that non-Tegra ARM systems exist as well.
-  # For now, this is my best guess.
-  #
-  set(TEGRA_SUPPORTED_PROCESSORS "armv71;arm;aarch64")
-  set(OTHER_SUPPORTED_PROCESSORS "i686;x86_64;AMD64")
-
-  set(CC_LIST_BY_SYSTEM_PROCESSOR "")
-  if(CMAKE_SYSTEM_PROCESSOR IN_LIST OTHER_SUPPORTED_PROCESSORS)
-    list(APPEND CC_LIST_BY_SYSTEM_PROCESSOR "20;21;30;35;50;52;60;61;70;75;80;86;89;90")
-  endif()
-  if(CMAKE_SYSTEM_PROCESSOR IN_LIST TEGRA_SUPPORTED_PROCESSORS)
-    list(APPEND CC_LIST_BY_SYSTEM_PROCESSOR "32;53;62;72")
-  endif()
-  if(NOT CC_LIST_BY_SYSTEM_PROCESSOR)
-    message(FATAL_ERROR "Unknown how to build for ${CMAKE_SYSTEM_PROCESSOR}")
-  endif()
-
-  #
-  # Default setting of the CUDA CC versions to compile.
-  # Shortening the lists saves a lot of compile time.
-  #
-
-  # The current version last time this list was updated was CUDA 12.1.
-  if(CUDA_VERSION VERSION_GREATER_EQUAL 12)
-    set(CUDA_MIN_CC 50)
-    set(CUDA_MAX_CC 90)
-  elseif(CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
-    set(CUDA_MIN_CC 35)
-    set(CUDA_MAX_CC 90)
-  elseif(CUDA_VERSION VERSION_GREATER_EQUAL 11.1)
-    set(CUDA_MIN_CC 35)
-    set(CUDA_MAX_CC 86)
-  elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 11)
-    set(CUDA_MIN_CC 35)
-    set(CUDA_MAX_CC 80)
-  elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
-    set(CUDA_MIN_CC 30)
-    set(CUDA_MAX_CC 75)
-  elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
-    set(CUDA_MIN_CC 30)
-    set(CUDA_MAX_CC 72)
-  elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 8)
-    set(CUDA_MIN_CC 20)
-    set(CUDA_MAX_CC 62)
-  elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 7)
-    set(CUDA_MIN_CC 20)
-    set(CUDA_MAX_CC 53)
-  else()
-    message(FATAL_ERROR "We do not support a CUDA SDK below version 7.0")
-  endif()
-  if(${CHOOSE_CUDA_MIN_CC} GREATER ${CUDA_MIN_CC})
-    set(CUDA_MIN_CC ${CHOOSE_CUDA_MIN_CC})
-  endif()
-
-  set(CC_LIST "")
-  foreach(CC ${CC_LIST_BY_SYSTEM_PROCESSOR})
-    if( (${CC} GREATER_EQUAL ${CUDA_MIN_CC}) AND
-        (${CC} LESS_EQUAL ${CUDA_MAX_CC}) )
-      list(APPEND CC_LIST ${CC})
-    endif()
-  endforeach()
-
-  #
-  # Add all requested CUDA CCs to the command line for offline compilation
-  #
-  set(GENCODE_FLAGS "")
-  list(SORT CC_LIST)
-  foreach(CC_VERSION ${CC_LIST})
-    list(APPEND GENCODE_FLAGS "-gencode;arch=compute_${CC_VERSION},code=sm_${CC_VERSION}")
-  endforeach()
-
-  #
-  # Use the highest request CUDA CC for CUDA JIT compilation
-  #
-  list(LENGTH CC_LIST CC_LIST_LEN)
-  MATH(EXPR CC_LIST_LEN "${CC_LIST_LEN}-1")
-  list(GET CC_LIST ${CC_LIST_LEN} CC_LIST_LAST)
-  list(APPEND GENCODE_FLAGS "-gencode;arch=compute_${CC_LIST_LAST},code=compute_${CC_LIST_LAST}")
-
-  #
-  # Two variables are exported to the parent scope. One is passed through the
-  # environment (CUDA_NVCC_FLAGS), the other is passed by name (SUPPORTED_CC)
-  #
-  set(${SUPPORTED_GENCODE_FLAGS} "${GENCODE_FLAGS}" PARENT_SCOPE)
-  set(${SUPPORTED_CC} "${CC_LIST}" PARENT_SCOPE)
-endfunction()
-
-#
-# Return the gencode parameters for a given list of CCs.
-#
-# Usage:
-#   getFlagsForCudaCCList(INPUT_CC_LIST SUPPORTED_GENCODE_FLAGS)
-#
-# INPUT_CC_LIST in variable. Contains a list of supported CCs.
-# SUPPORTED_GENCODE_FLAGS out variable. List of gencode flags to append to, e.g., CUDA_NVCC_FLAGS
-#
-function(getFlagsForCudaCCList INPUT_CC_LIST SUPPORTED_GENCODE_FLAGS)
-  set(CC_LIST "${${INPUT_CC_LIST}}")
-
-  #
-  # Add all requested CUDA CCs to the command line for offline compilation
-  #
-  set(GENCODE_FLAGS "")
-  list(SORT CC_LIST)
-  foreach(CC_VERSION ${CC_LIST})
-    list(APPEND GENCODE_FLAGS "-gencode;arch=compute_${CC_VERSION},code=sm_${CC_VERSION}")
-  endforeach()
-
-  #
-  # Use the highest request CUDA CC for CUDA JIT compilation
-  #
-  list(LENGTH CC_LIST CC_LIST_LEN)
-  MATH(EXPR CC_LIST_LEN "${CC_LIST_LEN}-1")
-  list(GET CC_LIST ${CC_LIST_LEN} CC_LIST_LAST)
-  list(APPEND GENCODE_FLAGS "-gencode;arch=compute_${CC_LIST_LAST},code=compute_${CC_LIST_LAST}")
-
-  message(STATUS "Setting gencode flags: ${GENCODE_FLAGS}")
-
-  #
-  # Two variables are exported to the parent scope. One is passed through the
-  # environment (CUDA_NVCC_FLAGS), the other is passed by name (SUPPORTED_CC)
-  #
-  set(${SUPPORTED_GENCODE_FLAGS} "${GENCODE_FLAGS}" PARENT_SCOPE)
-endfunction()
-
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
index 30eea599..b8fce13a 100644
--- a/cmake/Config.cmake.in
+++ b/cmake/Config.cmake.in
@@ -40,6 +40,7 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(Threads REQUIRED)
+find_dependency(CUDAToolkit REQUIRED)
 
 include("${CMAKE_CURRENT_LIST_DIR}/@popsift_targets_export_name@.cmake")
 check_required_components("@PROJECT_NAME@")
diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in
index 427cfe42..86095a55 100644
--- a/cmake/sift_config.h.in
+++ b/cmake/sift_config.h.in
@@ -8,11 +8,11 @@
 
 #pragma once
 
-#define POPSIFT_IS_DEFINED(F) F() == 1
-
-#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @HAVE_SHFL_DOWN_SYNC@
-#define POPSIFT_HAVE_NORMF()          @PopSift_HAVE_NORMF@
-#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@
-#define POPSIFT_USE_NVTX()            @PopSift_USE_NVTX@
+#define POPSIFT_IS_DEFINED(F)   (F() == 1) /* parenthesized so that !POPSIFT_IS_DEFINED(X) negates the whole test */
+#define POPSIFT_IS_UNDEFINED(F) (F() == 0) /* true when the feature macro F expands to 0 */
 
+#define POPSIFT_HAVE_SHFL_DOWN_SYNC()     @PopSift_HAVE_SHFL_DOWN_SYNC@
+#define POPSIFT_HAVE_NORMF()              @PopSift_HAVE_NORMF@
+#define POPSIFT_DISABLE_GRID_FILTER()     @DISABLE_GRID_FILTER@
+#define POPSIFT_USE_NVTX()                @PopSift_USE_NVTX@
 
diff --git a/cudaInstallAppveyor.cmd b/cudaInstallAppveyor.cmd
index 381d4048..9d43f7fa 100644
--- a/cudaInstallAppveyor.cmd
+++ b/cudaInstallAppveyor.cmd
@@ -1,19 +1,44 @@
 @echo off
-echo Downloading CUDA toolkit 9
-appveyor DownloadFile  https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/cuda_9.1.85_windows -FileName cuda_9.1.85_windows.exe
+echo Downloading CUDA toolkit 12 for Windows 10
+REM Disabled (REM, because # is not a comment in cmd.exe): appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.5.1/network_installers/cuda_12.5.1_windows_network.exe -Filename cuda_12.5.1_windows.exe
+
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.5.82-archive.zip -Filename cuda_nvcc.zip
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.5.82-archive.zip -Filename cuda_cudart.zip
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.5.82-archive.zip -Filename cuda_nvtx.zip
+appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.5.82-archive.zip -Filename vs_integration.zip
+dir
+
+echo Unzipping CUDA toolkit 12
+tar -xf cuda_nvcc.zip
+tar -xf cuda_cudart.zip
+tar -xf cuda_nvtx.zip
+tar -xf vs_integration.zip
 dir
-echo Installing CUDA toolkit 9
-cuda_9.1.85_windows.exe -s nvcc_9.1 ^
-                           cublas_9.1 ^
-                           cublas_dev_9.1 ^
-                           cudart_9.1 ^
-                           curand_9.1 ^
-                           curand_dev_9.1
 
-echo CUDA toolkit 9 installed
+echo Making CUDA install dir(s)
+mkdir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5"
+mkdir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras"
+
+echo Copying toolkit files to install dir(s)
+xcopy cuda_cudart-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y
+xcopy cuda_nvcc-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y
+xcopy cuda_nvtx-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y
+xcopy visual_studio_integration-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras" /s /e /i /y
+
+
+REM Disabled alternative: silent install of selected components with the downloaded installer.
+REM (REM is used because a leading # is not a comment in cmd.exe and would be run as a command.)
+REM cuda_12.5.1_windows.exe -s nvcc_12.5 cudart_12.5
+
+
+echo CUDA toolkit 12 installed
 
 dir "%ProgramFiles%"
 
-set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.1\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.1\libnvvp;%PATH%
+set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\libnvvp;%PATH%
+
+dir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA"
+dir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5"
+dir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\bin"
 
-nvcc -V
\ No newline at end of file
+nvcc -V
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0380dd41..ff3b3681 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,7 @@
+# Do not specify SHARED or STATIC in add_library. Let the variable BUILD_SHARED_LIBS determine this.
 
-CUDA_ADD_LIBRARY(popsift
-        popsift/popsift.cpp popsift/popsift.h
+add_library(popsift
+        popsift/popsift.cu popsift/popsift.h
         popsift/features.cu popsift/features.h
         popsift/sift_constants.cu popsift/sift_constants.h
         popsift/sift_conf.cu popsift/sift_conf.h
@@ -39,28 +40,35 @@ CUDA_ADD_LIBRARY(popsift
         popsift/common/vec_macros.h
         popsift/common/clamp.h)
 
+target_link_libraries(popsift
+                      PUBLIC
+		      CUDA::cudart
+		      Threads::Threads)
+
+if(PopSift_USE_NVTX_PROFILING)
+  # CUDA::nvtx3 is only provided by FindCUDAToolkit from CMake 3.25 on, but this project
+  # accepts CMake 3.24: fall back to the older CUDA::nvToolsExt target when it is missing.
+  target_link_libraries(popsift PUBLIC $<IF:$<TARGET_EXISTS:CUDA::nvtx3>,CUDA::nvtx3,CUDA::nvToolsExt>)
+endif()
+
+set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
+set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
+set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
 # build directory containing the automatically generated files
 set(popsift_generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
 
 # BUILD_INTERFACE allows to include the directory with source only when target is
 # built in the building tree (ie, not from an install location)
+# The CUDA include dir variable has changed from the old CUDA_INCLUDE_DIRS to the new CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES
 target_include_directories(popsift
             PUBLIC
             $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
             $<BUILD_INTERFACE:${popsift_generated_dir}>
             $<BUILD_INTERFACE:${popsift_generated_dir}/popsift>
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/>
-            ${CUDA_INCLUDE_DIRS})
-
-
-set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
-set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
-
-# cannot use PRIVATE here as there is a bug in FindCUDA and CUDA_ADD_LIBRARY
-# https://gitlab.kitware.com/cmake/cmake/issues/16097
-target_link_libraries(popsift ${CUDA_CUDADEVRT_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} Threads::Threads)
-
+	    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+	    )
 
 # EXPORTING THE LIBRARY
 #
diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt
index 3b28cec8..2379c57d 100755
--- a/src/application/CMakeLists.txt
+++ b/src/application/CMakeLists.txt
@@ -42,7 +42,7 @@ find_package(DevIL COMPONENTS IL ILU) # yields IL_FOUND, IL_LIBRARIES, IL_INCLUD
 if(PopSift_BOOST_USE_STATIC_LIBS)
   set(Boost_USE_STATIC_LIBS ON)
 endif()
-find_package(Boost 1.53.0 REQUIRED COMPONENTS filesystem program_options system)
+find_package(Boost 1.71.0 REQUIRED COMPONENTS filesystem program_options system)
 if(WIN32)
   add_definitions("-DBOOST_ALL_NO_LIB")
 endif(WIN32)
@@ -60,10 +60,6 @@ else()
   set(PD_COMPILE_OPTIONS "" )
 endif()
 
-if(PopSift_USE_NVTX_PROFILING)
-  list(APPEND PD_LINK_LIBS ${CUDA_NVTX_LIBRARY})
-endif(PopSift_USE_NVTX_PROFILING)
-
 #############################################################
 # popsift-demo
 #############################################################
@@ -73,7 +69,7 @@ add_executable(popsift-demo  main.cpp pgmread.cpp pgmread.h)
 set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
 
 target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} )
-target_include_directories(popsift-demo PUBLIC ${PD_INCLUDE_DIRS})
+target_include_directories(popsift-demo PUBLIC $<TARGET_PROPERTY:PopSift::popsift,INTERFACE_INCLUDE_DIRECTORIES> ${PD_INCLUDE_DIRS}) # a bare target name would be taken as a relative path
 target_compile_definitions(popsift-demo PRIVATE ${Boost_DEFINITIONS})
 target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
@@ -87,7 +83,7 @@ add_executable(popsift-match match.cpp pgmread.cpp pgmread.h)
 set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
 
 target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} )
-target_include_directories(popsift-match PUBLIC ${PD_INCLUDE_DIRS})
+target_include_directories(popsift-match PUBLIC $<TARGET_PROPERTY:PopSift::popsift,INTERFACE_INCLUDE_DIRECTORIES> ${PD_INCLUDE_DIRS}) # a bare target name would be taken as a relative path
 target_compile_definitions(popsift-match PRIVATE ${Boost_DEFINITIONS})
 target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
diff --git a/src/application/main.cpp b/src/application/main.cpp
index 0eec1c22..bf1128ff 100755
--- a/src/application/main.cpp
+++ b/src/application/main.cpp
@@ -30,13 +30,6 @@
 #endif
 #include "pgmread.h"
 
-#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
-#else
-#define nvtxRangePushA(a)
-#define nvtxRangePop()
-#endif
-
 using namespace std;
 
 static bool print_dev_info  = false;
@@ -183,8 +176,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
             exit( -1 );
         }
 
-        nvtxRangePushA( "load and convert image - devil" );
-
         ilImage img;
         if( img.Load( inputFile.c_str() ) == false ) {
             cerr << "Could not load image " << inputFile << endl;
@@ -200,8 +191,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
 
         image_data = img.GetData();
 
-        nvtxRangePop( ); // "load and convert image - devil"
-
         job = PopSift.enqueue( w, h, image_data );
 
         img.Clear();
@@ -209,7 +198,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
     else
 #endif
     {
-        nvtxRangePushA( "load and convert image - pgmread" );
         int w{};
         int h{};
         image_data = readPGMfile( inputFile, w, h );
@@ -217,8 +205,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
             exit( EXIT_FAILURE );
         }
 
-        nvtxRangePop( ); // "load and convert image - pgmread"
-
         if( ! float_mode )
         {
             // PopSift.init( w, h );
@@ -251,21 +237,15 @@ void read_job( SiftJob* job, bool really_write )
          << endl;
 
     if( really_write ) {
-        nvtxRangePushA( "Writing features to disk" );
-
         std::ofstream of( "output-features.txt" );
         feature_list->print( of, write_as_uchar );
     }
     delete feature_list;
-
-    if( really_write ) {
-        nvtxRangePop( ); // Writing features to disk
-    }
 }
 
 int main(int argc, char **argv)
 {
-    cudaDeviceReset();
+    popsift::cuda::reset();
 
     popsift::Config config;
     list<string>   inputFiles;
diff --git a/src/application/match.cpp b/src/application/match.cpp
index 852d9b62..3460975d 100755
--- a/src/application/match.cpp
+++ b/src/application/match.cpp
@@ -30,13 +30,6 @@
 #endif
 #include "pgmread.h"
 
-#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
-#else
-#define nvtxRangePushA(a)
-#define nvtxRangePop()
-#endif
-
 using namespace std;
 
 static bool print_dev_info  {false};
@@ -171,7 +164,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
     unsigned char* image_data;
     SiftJob* job;
 
-    nvtxRangePushA( "load and convert image" );
 #ifdef USE_DEVIL
     if( ! pgmread_loading )
     {
@@ -189,8 +181,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
         cout << "Loading " << w << " x " << h << " image " << inputFile << endl;
         image_data = img.GetData();
 
-        nvtxRangePop( );
-
         // PopSift.init( w, h );
         job = PopSift.enqueue( w, h, image_data );
 
@@ -206,8 +196,6 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
             exit( EXIT_FAILURE );
         }
 
-        nvtxRangePop( );
-
         // PopSift.init( w, h );
         job = PopSift.enqueue( w, h, image_data );
 
@@ -219,7 +207,7 @@ SiftJob* process_image( const string& inputFile, PopSift& PopSift )
 
 int main(int argc, char **argv)
 {
-    cudaDeviceReset();
+    popsift::cuda::reset();
 
     popsift::Config config;
     string         lFile{};
diff --git a/src/popsift/common/device_prop.cu b/src/popsift/common/device_prop.cu
index a55821cd..44d47c1d 100644
--- a/src/popsift/common/device_prop.cu
+++ b/src/popsift/common/device_prop.cu
@@ -14,6 +14,16 @@ namespace popsift { namespace cuda {
 
 using namespace std;
 
+void reset() // wrapper that lets callers reset the device through popsift::cuda instead of the CUDA runtime
+{
+    cudaDeviceReset(); // return status is discarded
+}
+
+void sync() // wrapper around the CUDA runtime's device-wide synchronize call
+{
+    cudaDeviceSynchronize(); // return status is discarded
+}
+
 device_prop_t::device_prop_t( )
 {
     int         currentDevice;
diff --git a/src/popsift/common/device_prop.h b/src/popsift/common/device_prop.h
index 7a0b142d..8910e9c8 100644
--- a/src/popsift/common/device_prop.h
+++ b/src/popsift/common/device_prop.h
@@ -13,6 +13,15 @@
 namespace popsift {
 namespace cuda {
 
+/** Forwards to cudaDeviceReset() and discards its return status; the demo applications call this instead of the CUDA runtime function.
+ */
+void reset();
+
+/** Forwards to cudaDeviceSynchronize() and discards its return status.
+ */
+void sync();
+
+
 /**
  * @brief A class to recover, query and print the information about the cuda device.
  */
diff --git a/src/popsift/popsift.cpp b/src/popsift/popsift.cu
similarity index 100%
rename from src/popsift/popsift.cpp
rename to src/popsift/popsift.cu
diff --git a/src/popsift/popsift.h b/src/popsift/popsift.h
index 3b5f72b8..5654cc76 100755
--- a/src/popsift/popsift.h
+++ b/src/popsift/popsift.h
@@ -24,7 +24,7 @@
 #include <vector>
 
 #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangeStartA(a)
 #define nvtxRangeEnd(a)
diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu
index a766c2de..bfe2e64e 100644
--- a/src/popsift/s_filtergrid.cu
+++ b/src/popsift/s_filtergrid.cu
@@ -10,7 +10,7 @@
 #include "sift_pyramid.h"
 
 #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
diff --git a/src/popsift/s_image.cu b/src/popsift/s_image.cu
index f26b8e3e..a966dd39 100755
--- a/src/popsift/s_image.cu
+++ b/src/popsift/s_image.cu
@@ -16,7 +16,7 @@
 #include <iostream>
 
 #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu
index f6b36fcd..b34aaaa1 100644
--- a/src/popsift/s_orientation.cu
+++ b/src/popsift/s_orientation.cu
@@ -19,7 +19,7 @@
 #include <cstdio>
 
 #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
diff --git a/src/popsift/sift_constants.h b/src/popsift/sift_constants.h
index 20b3012d..883515a7 100755
--- a/src/popsift/sift_constants.h
+++ b/src/popsift/sift_constants.h
@@ -15,20 +15,27 @@
 #ifndef NINF
 #define NINF              (-INF)
 #endif
-#ifdef M_PI
-#undef M_PI
-// #define M_PI  3.14159265358979323846f
-#endif
-__device__ static const
-float M_PI = 3.14159265358979323846f;
-#ifdef M_PI2
-#undef M_PI2
-// #define M_PI2 (2.0F * M_PI)
-#endif
-__device__ static const
-float M_PI2 = 2.0f * 3.14159265358979323846f;
 
-#define M_4RPI               (4.0f / M_PI)
+#undef USE_CONSTANT_PI // always selects the #else branch below, even if USE_CONSTANT_PI was defined earlier
+#ifdef USE_CONSTANT_PI // variant 1 (currently disabled): M_PI and M_PI2 as __device__ float constants
+  #ifdef M_PI
+    #undef M_PI // drop any earlier macro so the name can be declared as a variable
+  #endif
+  __device__ static const float M_PI = 3.14159265358979323846f;
+  #ifdef M_PI2
+    #undef M_PI2 // same as for M_PI above
+  #endif
+  __device__ static const float M_PI2 = 2.0f * 3.14159265358979323846f;
+  #define M_4RPI               (4.0f / M_PI)
+#else // variant 2 (active): plain preprocessor macros
+  #ifndef M_PI // an existing M_PI is kept as is; NOTE(review): it may be a double from the math headers - confirm
+    #define M_PI 3.14159265358979323846f
+  #endif
+  #ifndef M_PI2 // an existing M_PI2 is kept as is
+    #define M_PI2 (2.0F * M_PI)
+  #endif
+  #define M_4RPI               (4.0f / M_PI)
+#endif // USE_CONSTANT_PI
 
 #define DESC_MIN_FLOAT               1E-15F
 
diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu
index b0eb0bd1..f533df35 100644
--- a/src/popsift/sift_desc.cu
+++ b/src/popsift/sift_desc.cu
@@ -22,7 +22,7 @@
 #include <iostream>
 
 #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()
diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu
index 06060052..c03b0d61 100644
--- a/src/popsift/sift_pyramid.cu
+++ b/src/popsift/sift_pyramid.cu
@@ -26,7 +26,7 @@
 #endif
 
 #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX)
-#include <nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCuda.h>
 #else
 #define nvtxRangePushA(a)
 #define nvtxRangePop()