marian-nmt · emjotde · Nov 13, 2023 · Nov 24, 2023 · Nov 27, 2023 · Dec 2, 2023
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,7 @@
+/regression-tests
+/build*
+/.pytest_cache
+/.vscode
+/dist
+/doc
+.history*
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
@@ -49,5 +49,11 @@ jobs:
         ./marian --version
         ./marian-decoder --version
         ./marian-scorer --version
-        ./spm_encode --version
         ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \))
+
+    # - name: Install PyMarian
+    #   run: |
+    #     python3 -m pip install --upgrade pip setuptools wheel pytest
+    #     CMAKE_ARGS="" python3 -m pip install -v .
+    #     python3 -m pymarian -v
+    #     MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
@@ -40,7 +40,7 @@ jobs:
             cpu: false
             gpu: true
             unit_tests: false
-            examples: true
+            examples: false
           # Ubuntu 22.04 supports CUDA 11.7
           # Unit tests and examples are not compiled to save disk space
           - name: "Ubuntu 22.04 CUDA 11.7 gcc-11"
@@ -115,6 +115,7 @@ jobs:
           -DCOMPILE_CPU=${{ matrix.cpu }} \
           -DCOMPILE_CUDA=${{ matrix.gpu }} \
           -DCOMPILE_EXAMPLES=${{ matrix.examples }} \
+          -DUSE_TCMALLOC=OFF \
           -DCOMPILE_SERVER=on \
           -DCOMPILE_TESTS=${{ matrix.unit_tests }} \
           -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
@@ -143,3 +144,13 @@ jobs:
         ./marian-server --version
         ./spm_encode --version
         ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \))
+
+    # - name: Install PyMarian
+    #   working-directory: build
+    #   env:
+    #     CUDA_VERSION: ${{ matrix.cuda }}
+    #   run: |
+    #     python3 -m pip install --upgrade pip setuptools wheel pytest
+    #     CMAKE_ARGS="" python3 -m pip install -v .
+    #     python3 -m pymarian -v
+    #     MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -134,4 +134,16 @@ jobs:
         .\marian-decoder.exe --version
         .\marian-scorer.exe --version
         dir *.exe
+        cd ..
       shell: cmd
+
+    # - name: Install PyMarian
+    #   working-directory: src/python
+    #   run: |
+    #     python3 -m pip install --upgrade pip setuptools wheel pytest
+    #     python3 -m pip install -v .
+    #     python3 -m pymarian -v
+    #     python3 -m pytest -vs src/python/tests
+    #   env:
+    #     CUDA_VERSION: ${{ matrix.cuda }}
+    #   shell: cmd
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
-# Config files from CMake
+.history*
 src/common/project_version.h
 src/common/git_revision.h
 src/common/build_info.cpp
@@ -48,6 +48,8 @@ pingme.txt
 # CMake files
 build
 build-*
+# pymarian wheels
+dist/
 
 # Examples
 examples/*/*.gz
@@ -61,4 +63,13 @@ examples/mnist/*ubyte
 /vs/MarianDll.VC.VC.opendb
 
 .vs
-.vscode
+.vscode
+
+# Python : pymarian
+*.whl
+*.egg-info
+src/python/pymarian/_version.py
+src/python/tests/data
+__pycache__
+.pytest_cache
+
diff --git a/.gitmodules b/.gitmodules
@@ -22,7 +22,10 @@
 	url = https://github.com/marian-nmt/Simple-WebSocket-Server
 [submodule "src/3rd_party/ruy"]
 	path = src/3rd_party/ruy
-	url = https://github.com/google/ruy.git
+	url = https://github.com/marian-nmt/ruy.git
 [submodule "src/3rd_party/simd_utils"]
 	path = src/3rd_party/simd_utils
-	url = https://github.com/JishinMaster/simd_utils.git
+	url = https://github.com/marian-nmt/simd_utils.git
+[submodule "src/3rd_party/pybind11"]
+	path = src/3rd_party/pybind11
+	url = https://github.com/pybind/pybind11.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,10 +7,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 - Fixed compilation with clang 16.0.6
-- Added Threads::Threads to EXT_LIBS
-
+- Added Threads::Threads to `EXT_LIBS`
+- Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace
 
 ### Added
+- Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size.
+- Added `--no-optimizer-reload` to skip optimizer state loading during continued training or fallback.
+- Added `pymarian-eval`, CLI for scoring metrics
+- Added `--input-reorder pos1 pos2` option to re-order inputs internally when reading in batches. This is mostly a model property.
+- Added `pymarian`: python bindings based on pybind11
+- Added implementation of COMET-KIWI
+- Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now)
+- Added implementation of COMET-22 (reference-based) model and conversion
+- Added sparsemax operator (slow version)
+- Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively.
+- Added ALIBI related options to new layer framework.
 - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode.
 - Added MSE and MAE costs to COMET-QE training.
 - Added augmentation of shuffled examples to COMET-QE training via `--comet-augment-bad`.
@@ -29,6 +40,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - New experimental layer framework for Transformer-like models.
 
 ### Fixed
+- Do not mmap files for conversion via Quicksand API
+- Fixed ALiBI states and caching in new layer framework
+- Throw exception when forcing with FS vocabs
+- Fixed force-decoding with LSH
+- Fixed force-decoding for beam-size > 1
+- Fixed lost node in mt-detect metrics
+- Fixed BLEURT logmask computation
 - Fixed wrong paramter name for norm in new layer framework
 - Fixed unit test for LayerNorm
 - Only collect batch statistics during mini-batch-fit up to actual max-length.
@@ -37,6 +55,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp).
 
 ### Changed
+- Refactored model loading: mmapping now happens opportunistically; `--mmap-models` for decoding forces mmap and fails with an error if that is not possible.
 - Removed --num-devices N option that wasn't really used by anyone (I assume).
 
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -30,9 +30,11 @@
 option(USE_MPI "Use MPI library" OFF)
 option(USE_NCCL "Use NCCL library" ON)
 option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
+option(USE_TCMALLOC "Use TCMALLOC if available" ON)
 option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
 option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
 option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)
+option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF)
 
 # fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them,
 # so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12,
@@ -121,6 +123,12 @@
 set(THREADS_PREFER_PTHREAD_FLAG TRUE)
 find_package(Threads REQUIRED)
 set(EXT_LIBS ${EXT_LIBS} Threads::Threads)
+
+# PyMarian builds must not use TCMalloc: turn USE_TCMALLOC off whenever PYMARIAN is on.
+if(USE_TCMALLOC AND PYMARIAN)
+  message(WARNING "TCMalloc can cause segfaults with some python libraries. Hence disabling TCMalloc for a robust pymarian build.")
+  set(USE_TCMALLOC off)  # normal variable; shadows the cached option() value for the checks that follow
+endif()
 ########
 
 ###############################################################################
@@ -148,7 +156,7 @@
   set(INTRINSICS "/arch:AVX2")
   # set(INTRINSICS "/arch:AVX512")
   # /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
-  set(CMAKE_CXX_FLAGS           "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
+  set(CMAKE_CXX_FLAGS           "/permissive- /EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
   set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
   set(CMAKE_CXX_FLAGS_DEBUG     "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")
 
@@ -286,8 +294,8 @@
     set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
   endif(CMAKE_COMPILER_IS_GNUCC)
 
-  set(CMAKE_CXX_FLAGS                 "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
-  set(CMAKE_CXX_FLAGS_RELEASE         "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
+  set(CMAKE_CXX_FLAGS                 "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
+  set(CMAKE_CXX_FLAGS_RELEASE         "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
   set(CMAKE_CXX_FLAGS_DEBUG           "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
   set(CMAKE_CXX_FLAGS_SLIM            "-O3 -funroll-loops -DNDEBUG")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO  "${CMAKE_CXX_FLAGS_RELEASE}")
@@ -297,7 +305,7 @@
 
   # these need to be set separately
   set(CMAKE_C_FLAGS                 "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
-  set(CMAKE_C_FLAGS_RELEASE         "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
+  set(CMAKE_C_FLAGS_RELEASE         "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
   set(CMAKE_C_FLAGS_DEBUG           "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
   set(CMAKE_C_FLAGS_SLIM            "-O3 -funroll-loops -DNDEBUG")
   set(CMAKE_C_FLAGS_RELWITHDEBINFO  "${CMAKE_C_FLAGS_RELEASE}")
@@ -399,6 +407,7 @@
     LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
   endif()
 
+  message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
   if(COMPILE_KEPLER)
     message(STATUS "Compiling code for Kepler GPUs")
     LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
@@ -464,7 +473,7 @@
     if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
       find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
       if(NOT CUDA_cublasLt_LIBRARY)
-        message(FATAL_ERROR "cuBLASLt library not found")
+        message(FATAL_ERROR "cuBLASLt library not found. -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
       endif()
       set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
       set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
@@ -513,21 +522,26 @@
 endif(CUDA_FOUND)

 else(COMPILE_CUDA)
  message(WARNING "COMPILE_CUDA=off : Building only CPU version")
 endif(COMPILE_CUDA)
 
 # TODO: make compatible with older CUDA versions
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
+  list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
 else(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
+  list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
 endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
 if(NOT MSVC)
   # @TODO: add warnings here too
-  list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
+  list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
   list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC})
 else()
-  list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
+  # MSVC host compiler: c++17 doesn't work with CUDA 10, so request /std:c++17 only for CUDA >= 11.0 and /std:c++14 otherwise
+  if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
+    list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++17; -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
+  else()
+    list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++14; -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
+  endif()
 endif()
 
 list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS)
@@ -543,20 +557,22 @@
 endif()
 
 ###############################################################################
-# Find Tcmalloc_minimal 
+# Find Tcmalloc_minimal
 # re-used from sentencepiece
-if(NOT WIN32)
-  if(USE_STATIC_LIBS)
-    find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)
-  else()
-    find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
-  endif()
-  if (TCMALLOC_LIB)
-    message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
-    set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES})
-    add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
-  else()
-    message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
+if(USE_TCMALLOC)  # opt-out switch; the whole lookup is skipped when it is off
+  if(NOT WIN32)  # no TCMalloc lookup on Windows
+    if(USE_STATIC_LIBS)
+      find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)  # static build: ask for the archive by file name
+    else()
+      find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
+    endif()
+    if (TCMALLOC_LIB)
+      message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
+      set(EXT_LIBS ${EXT_LIBS} ${TCMALLOC_LIB})  # link the library find_library located; Tcmalloc_LIBRARIES is never set here
+      add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
+    else()
+      message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
+    endif()
   endif()
 endif()
 
@@ -632,7 +648,7 @@
 endif()

 if(DETERMINISTIC)
  message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down")
  add_definitions(-DDETERMINISTIC=1)
  list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=1; )
 else()

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-v1.12.14
+v1.12.31