Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add oneDNN #937

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
87b8d53
Add oneDNN submodule to 3rd_party
graemenail May 5, 2022
e72a0dd
Add oneDNN to CMake
graemenail May 5, 2022
d341447
Don't build DNNL examples
graemenail May 5, 2022
72e0b1a
Allow static builds of DNNL
graemenail May 5, 2022
641a816
Remove MKL include from config parser
graemenail May 17, 2022
1e8555f
Remove MKL from microsoft/quicksand
graemenail May 17, 2022
2b44b3f
Remove MKL omatcopy from FBGEMM packed_gemm
graemenail May 17, 2022
bdc02f6
Remove MKL from GitHub Actions
graemenail May 17, 2022
49140fc
Remove MKL call from 3in4 transpose, reverting to plain copy (on forw…
graemenail May 17, 2022
9da225e
Replace MKL sgemm with oneDNN sgemm
graemenail May 17, 2022
348fa95
Remove MKL from CMake and improve oneDNN CMake
graemenail May 17, 2022
6e60006
Remove MKL guards in prod. This also removes sgemm_batched
graemenail May 17, 2022
5879c20
Use int in loop for ProdBatched
graemenail May 17, 2022
586d20c
oneDNN only use OMP runtime when specified
graemenail May 18, 2022
3a5d3f3
Move MSVC unicode flags out of global flags
graemenail May 30, 2022
bfe7146
Disable DNNL JIT Profiling
graemenail May 19, 2022
e1bba93
Cache Boost
graemenail May 30, 2022
201321b
Clean up after debug build
graemenail Jun 3, 2022
58a1a44
Remove FindMKL.cmake
graemenail Jun 3, 2022
2b88d54
Update Changelog
graemenail Jun 3, 2022
4e3572d
Remove MKL from release workflow
graemenail Jun 3, 2022
8852700
Replace MKL reference in docs
graemenail Jun 3, 2022
ce859c4
Fix comments mentioning MKL
graemenail Jun 3, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 0 additions & 16 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,6 @@ jobs:
- name: Install dependencies
run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev gcc-${{ env.gcc_version }} g++-${{ env.gcc_version }}

# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
- name: Install MKL
run: |
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
# The script simplifies installation of different versions of CUDA
- name: Install CUDA
run: ./scripts/ci/install_cuda_ubuntu.sh ${{ env.cuda_version }}
Expand Down Expand Up @@ -156,15 +149,6 @@ jobs:
with:
submodules: recursive

- name: Download MKL
run: |
C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl
# Set the MKLROOT environment variable so that CMake can find MKL.
# GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners
echo "MKLROOT=$env:GITHUB_WORKSPACE/mkl" | Out-File -FilePath $env:GITHUB_ENV -Append
shell: pwsh

- name: Install CUDA
run: |
.\scripts\ci\install_cuda_windows.ps1 '${{ env.cuda_version }}'
Expand Down
10 changes: 0 additions & 10 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,6 @@ jobs:
sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev \
gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }}

# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
- name: Install MKL
run: |
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
if: matrix.cpu == true

# The script simplifies installation of different versions of CUDA
- name: Install CUDA
run: ./scripts/ci/install_cuda_ubuntu.sh ${{ matrix.cuda }}
Expand Down Expand Up @@ -122,4 +113,3 @@ jobs:
./marian-scorer --version
./marian-server --version
./spm_encode --version

24 changes: 14 additions & 10 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ on:
branches: [ master ]

env:
MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip"
BOOST_ROOT: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64"
BOOST_URL: "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe"

Expand All @@ -34,15 +33,6 @@ jobs:
with:
submodules: recursive

- name: Download MKL
run: |
# Wget retries downloading files and is faster than Invoke-WebRequest
C:\msys64\usr\bin\wget.exe -nv ${{ env.MKL_URL }} -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl
# Set MKLROOT environment variable so that CMake can find MKL
echo "MKLROOT=${{ github.workspace }}\mkl" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
shell: powershell

- name: Install CUDA
run: |
.\scripts\ci\install_cuda_windows.ps1 "10.2"
Expand All @@ -51,9 +41,17 @@ jobs:
echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
shell: powershell
if: matrix.gpu == true
# Cache boost install
- name: Cache Boost
id: cache-boost
uses: actions/cache@v3
with:
path: ${{ env.BOOST_ROOT }}
key: ${{ runner.os }}-${{ env.BOOST_URL }}

# Boost is no longer pre-installed on GitHub-hosted Windows runners
- name: Download Boost
if: ${{ steps.cache-boost.outputs.cache-hit != 'true' }}
run: |
Write-Host "Downloading Boost to ${{ env.BOOST_ROOT }}"
C:\msys64\usr\bin\wget.exe -nv "${{ env.BOOST_URL }}" -O "${{ github.workspace }}/boost.exe"
Expand All @@ -70,6 +68,7 @@ jobs:

# Windows CUDA builds use USE_NCCL=off due to compilation errors.
- name: Build Debug
id: build-debug
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/Debug
Expand All @@ -95,6 +94,11 @@ jobs:
# able to find sometimes.
if: matrix.gpu == true

- name: Cleanup Debug
if: steps.build-debug.conclusion == 'success'
working-directory: ${{ github.workspace }}/build/Debug
run: cmake --build . --target clean

# Windows CUDA builds use USE_NCCL=off due to compilation errors
- name: Build Release
uses: lukka/run-cmake@v3
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/oneDNN"]
path = src/3rd_party/oneDNN
url = https://github.com/oneapi-src/oneDNN.git
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
- oneDNN is used for GEMM on CPU.

### Fixed
- Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast)
Expand All @@ -29,6 +30,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Faster LSH top-k search on CPU
- Updated intgemm to the latest upstream version
- Parameters in npz files are no longer implicitly assumed to be row-ordered. Non row-ordered parameters will result in an abort
- MKL is no longer used as a CPU backend.

## [1.11.0] - 2022-02-08

Expand Down
20 changes: 8 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_DNNL "Compile with oneDNN support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
Expand Down Expand Up @@ -84,6 +84,7 @@ endif()
# Set compilation flags
if(MSVC)
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND EXTRA_DEFINITIONS /DUNICODE /D_UNICODE)
list(APPEND ALL_WARNINGS /WX; /W4;)

# Disabled bogus warnings for CPU intrinsics and Protobuf:
Expand All @@ -105,7 +106,7 @@ if(MSVC)
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

Expand Down Expand Up @@ -509,6 +510,11 @@ if(COMPILE_CPU)
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
add_definitions(-DCOMPILE_CPU=1)
endif()

if(USE_DNNL)
set(EXT_LIBS ${EXT_LIBS} dnnl)
add_definitions(-DDNNL_FOUND=1)
endif(USE_DNNL)
if(USE_APPLE_ACCELERATE)
if(NOT APPLE)
message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.")
Expand All @@ -520,15 +526,6 @@ if(COMPILE_CPU)
set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
add_definitions(-DBLAS_FOUND=1)
else(USE_APPLE_ACCELERATE)
if(USE_MKL)
find_package(MKL)
endif(USE_MKL)
if(MKL_FOUND)
include_directories(${MKL_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${MKL_LIBRARIES})
set(BLAS_FOUND TRUE)
add_definitions(-DBLAS_FOUND=1 -DMKL_FOUND=1)
else(MKL_FOUND)
set(BLAS_VENDOR "OpenBLAS")
find_package(BLAS)
if(BLAS_FOUND)
Expand All @@ -539,7 +536,6 @@ if(COMPILE_CPU)
add_definitions(-DBLAS_FOUND=1)
endif(CBLAS_FOUND)
endif(BLAS_FOUND)
endif(MKL_FOUND)
endif(USE_APPLE_ACCELERATE)
endif(COMPILE_CPU)

Expand Down
137 changes: 0 additions & 137 deletions cmake/FindMKL.cmake

This file was deleted.

2 changes: 1 addition & 1 deletion doc/operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ libraries containing device-specific optimisations. These libraries include:
- CBLAS / OpenBLAS
- FBGEMM
- INTGEMM
- MKL
- oneDNN
- GPU
- CUDA (cuBLAS)

Expand Down
21 changes: 20 additions & 1 deletion src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,25 @@ if(COMPILE_CPU)
endif()
endif(COMPILE_CPU)

if(USE_DNNL)
# oneDNN
set(DNNL_BUILD_TESTS OFF CACHE BOOL "Build dnnl tests")
set(DNNL_BUILD_EXAMPLES OFF CACHE BOOL "Build dnnl examples")

set(DNNL_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE)
if(USE_STATIC_LIBS)
set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "specifies whether oneDNN library should be SHARED or STATIC" FORCE)
endif(USE_STATIC_LIBS)

if(NOT USE_OPENMP)
set(DNNL_CPU_RUNTIME SEQ CACHE INTERNAL "" FORCE)
endif()

add_subdirectory(./oneDNN)
include_directories(./oneDNN/include)

endif(USE_DNNL)

if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
# meant to silence CMakeFiles of 3rd_party tools.
Expand Down Expand Up @@ -169,7 +188,7 @@ if(CUDA_FOUND)
endif(COMPILE_AMPERE)

# install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Using $(MAKE) instead of $CMAKE_MAKE_PROGRAM in order to make parallelization in NCCL compilation work with make -j16.
# Apparently this does not get properly propagated otherwise and builds with only a single thread/process.
ExternalProject_Add(nccl_install
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/oneDNN
Submodule oneDNN added at 11fa74
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ set(MARIAN_SOURCES

add_library(marian STATIC ${MARIAN_SOURCES})

target_compile_options(marian PRIVATE ${ALL_WARNINGS})
target_compile_options(marian PRIVATE ${ALL_WARNINGS} ${EXTRA_DEFINITIONS})

# Generate git_revision.h to reflect current git revision information
# [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake]
Expand Down
Loading