Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge with internal master - 2024-08-05 #1026

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a728daa
Merged PR 31742: Fix docker url security: use microsoft cr
Nov 13, 2023
6fe9a80
Merged PR 31906: Updates to CI pipeline: new vcpkg and options to dis…
Nov 24, 2023
72c8d60
Merged PR 31918: Update MKL in GPU regression tests
Nov 27, 2023
a7cc324
Merged PR 31730: ALIBI with shifts
emjotde Dec 2, 2023
b61755b
Merged PR 31919: Nucleus and epsilon sampling
emjotde Dec 4, 2023
5e47ab2
Merged PR 32433: Fix Logmask in BLEURT model
emjotde Jan 6, 2024
fa06754
Merged PR 32547: Add support for sparsemax and comet-22 (not kiwi yet)
emjotde Jan 17, 2024
7dcebfb
Merged PR 32567: Refactoring of Graph loading and mmapping interface
emjotde Jan 20, 2024
1656b9c
Merged PR 32600: Full Comet-Kiwi implementation, partial xComet-XL/XXL
emjotde Jan 24, 2024
b5c892e
Merged PR 32781: Attach missing node for mt-detect models
emjotde Feb 1, 2024
1c63c1e
Merged PR 31744: Pymarian: python bindings to marian
Feb 3, 2024
5e6e1a0
Merged PR 32806: Various small changes and fixes to pybindings and py…
emjotde Feb 5, 2024
4cdf93a
Merged PR 32860: Azure CI: save disk space by disabling compilation f…
Feb 6, 2024
bd9a679
Merged PR 32636: Extending new layer framework to match production mo…
emjotde Feb 6, 2024
b683f4b
Merged PR 32882: Reorder inputs for kiwi-style metrics
emjotde Feb 8, 2024
22ed792
Merged PR 32937: Fixes force-decoding for beam-size larger 1
emjotde Feb 8, 2024
9e40ac3
Merged PR 32883: Pymarian improvements
Feb 15, 2024
39ade68
Merged PR 33078: Merge public master with internal master
emjotde Feb 22, 2024
01bc6b0
Merged PR 33010: support force-decoding for pymarian Translator API
Feb 22, 2024
4d184bb
Merged PR 33382: handle cusparse deprecation warnings with cuda 12.3
emjotde Mar 5, 2024
00ff086
Merged PR 33692: Add --no-optimizer-reload option
emjotde Apr 2, 2024
58a9150
Merged PR 33803: Fixes to force-decoding to enable LSH
emjotde Apr 16, 2024
b4ed630
Merged PR 34062: Add exception if force-decoding is used for FSM vocab
emjotde Apr 17, 2024
2745b77
Merged PR 34029: Fix regressions in new layer framework for ALIBI-bas…
emjotde Apr 20, 2024
07042cf
Merged PR 34167: Do not mmap files for conversion in Quicksand API
emjotde Apr 23, 2024
a6ab8af
Merged PR 34540: pymarian: build for multiple python versions; disabl…
Jun 27, 2024
2f9b6df
merge with internal master
emjotde Aug 5, 2024
be50e88
temporarily disable pymarian compilation
Aug 5, 2024
deb387e
Temporarily disable PyMarian and examples in GitHub workflows
Aug 7, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/regression-tests
/build*
/.pytest_cache
/.vscode
/dist
/doc
.history*
8 changes: 7 additions & 1 deletion .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,11 @@ jobs:
./marian --version
./marian-decoder --version
./marian-scorer --version
./spm_encode --version
ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \))

# - name: Install PyMarian
# run: |
# python3 -m pip install --upgrade pip setuptools wheel pytest
# CMAKE_ARGS="" python3 -m pip install -v .
# python3 -m pymarian -v
# MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
13 changes: 12 additions & 1 deletion .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
cpu: false
gpu: true
unit_tests: false
examples: true
examples: false
# Ubuntu 22.04 supports CUDA 11.7
# Unit tests and examples are not compiled to save disk space
- name: "Ubuntu 22.04 CUDA 11.7 gcc-11"
Expand Down Expand Up @@ -115,6 +115,7 @@ jobs:
-DCOMPILE_CPU=${{ matrix.cpu }} \
-DCOMPILE_CUDA=${{ matrix.gpu }} \
-DCOMPILE_EXAMPLES=${{ matrix.examples }} \
-DUSE_TCMALLOC=OFF \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=${{ matrix.unit_tests }} \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
Expand Down Expand Up @@ -143,3 +144,13 @@ jobs:
./marian-server --version
./spm_encode --version
ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \))

# - name: Install PyMarian
# working-directory: build
# env:
# CUDA_VERSION: ${{ matrix.cuda }}
# run: |
# python3 -m pip install --upgrade pip setuptools wheel pytest
# CMAKE_ARGS="" python3 -m pip install -v .
# python3 -m pymarian -v
# MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
12 changes: 12 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,16 @@ jobs:
.\marian-decoder.exe --version
.\marian-scorer.exe --version
dir *.exe
cd ..
shell: cmd

# - name: Install PyMarian
# working-directory: src/python
# run: |
# python3 -m pip install --upgrade pip setuptools wheel pytest
# python3 -m pip install -v .
# python3 -m pymarian -v
# python3 -m pytest -vs src/python/tests
# env:
# CUDA_VERSION: ${{ matrix.cuda }}
# shell: cmd
15 changes: 13 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Config files from CMake
.history*
src/common/project_version.h
src/common/git_revision.h
src/common/build_info.cpp
Expand Down Expand Up @@ -48,6 +48,8 @@ pingme.txt
# CMake files
build
build-*
# pymarian wheels
dist/

# Examples
examples/*/*.gz
Expand All @@ -61,4 +63,13 @@ examples/mnist/*ubyte
/vs/MarianDll.VC.VC.opendb

.vs
.vscode
.vscode

# Python : pymarian
*.whl
*.egg-info
src/python/pymarian/_version.py
src/python/tests/data
__pycache__
.pytest_cache

7 changes: 5 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/ruy"]
path = src/3rd_party/ruy
url = https://github.com/google/ruy.git
url = https://github.com/marian-nmt/ruy.git
[submodule "src/3rd_party/simd_utils"]
path = src/3rd_party/simd_utils
url = https://github.com/JishinMaster/simd_utils.git
url = https://github.com/marian-nmt/simd_utils.git
[submodule "src/3rd_party/pybind11"]
path = src/3rd_party/pybind11
url = https://github.com/pybind/pybind11.git
23 changes: 21 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
- Fixed compilation with clang 16.0.6
- Added Threads::Threads to EXT_LIBS

- Added Threads::Threads to `EXT_LIBS`
- Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace

### Added
- Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size.
- Added `--no-optimizer-reload` to skip optimizer state loading during continued training or fallback.
- Added `pymarian-eval`, CLI for scoring metrics
- Added `--input-reorder pos1 pos2` option to re-order inputs internally when reading in batches. This is mostly a model property.
- Added `pymarian`: python bindings based on pybind11
- Added implementation of COMET-KIWI
- Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now)
- Added implementation of COMET-22 (reference-based) model and conversion
- Added sparsemax operator (slow version)
- Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively.
- Added ALIBI related options to new layer framework.
- Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode.
- Added MSE and MAE costs to COMET-QE training.
- Added augmentation of shuffled examples to COMET-QE training via `--comet-augment-bad`.
Expand All @@ -29,6 +40,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- New experimental layer framework for Transformer-like models.

### Fixed
- Do not mmap files for conversion via Quicksand API
- Fixed ALiBI states and caching in new layer framework
- Throw exception when forcing with FS vocabs
- Fixed force-decoding with LSH
- Fixed force-decoding for beam-size > 1
- Fixed lost node in mt-detect metrics
- Fixed BLEURT logmask computation
- Fixed wrong parameter name for norm in new layer framework
- Fixed unit test for LayerNorm
- Only collect batch statistics during mini-batch-fit up to actual max-length.
Expand All @@ -37,6 +55,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp).

### Changed
- Refactoring of model loading, mmapping happens now opportunistically, --mmap-models for decoding forces mmap and croaks if not possible.
- Removed --num-devices N option that wasn't really used by anyone (I assume).


Expand Down
60 changes: 38 additions & 22 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_TCMALLOC "Use TCMALLOC if available" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)
option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF)

# fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them,
# so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12,
Expand Down Expand Up @@ -121,6 +123,12 @@
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
set(EXT_LIBS ${EXT_LIBS} Threads::Threads)

# Disable TCMalloc when building PyMarian (PYMARIAN=ON).
# If both USE_TCMALLOC and PYMARIAN are enabled, emit a configure-time warning and
# switch USE_TCMALLOC off, so that later checks of USE_TCMALLOC see it as disabled.
if(USE_TCMALLOC AND PYMARIAN)
message(WARNING "TCMalloc can cause segfaults with some python libraries. Hence disabling TCMalloc for a robust pymarian build.")
set(USE_TCMALLOC off)
endif()
########

###############################################################################
Expand Down Expand Up @@ -148,7 +156,7 @@
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS "/permissive- /EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

Expand Down Expand Up @@ -286,8 +294,8 @@
set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
endif(CMAKE_COMPILER_IS_GNUCC)

set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
Expand All @@ -297,7 +305,7 @@

# these need to be set separately
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
Expand Down Expand Up @@ -399,6 +407,7 @@
LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
endif()

message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
if(COMPILE_KEPLER)
message(STATUS "Compiling code for Kepler GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
Expand Down Expand Up @@ -464,7 +473,7 @@
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found")
message(FATAL_ERROR "cuBLASLt library not found. -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
Expand Down Expand Up @@ -513,21 +522,26 @@
endif(CUDA_FOUND)

else(COMPILE_CUDA)
message(WARNING "COMPILE_CUDA=off : Building only CPU version")

Check warning on line 525 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU-only

COMPILE_CUDA=off : Building only CPU version
endif(COMPILE_CUDA)

# TODO: make compatible with older CUDA versions
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
else(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(NOT MSVC)
# @TODO: add warnings here too
list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC})
else()
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
# c++17 doesn't work with CUDA 10
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++17; -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
else()
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++14; -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
endif()
endif()

list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS)
Expand All @@ -543,20 +557,22 @@
endif()

###############################################################################
# Find Tcmalloc_minimal
# Find Tcmalloc_minimal
# re-used from sentencepiece
if(NOT WIN32)
if(USE_STATIC_LIBS)
find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)
else()
find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
endif()
if (TCMALLOC_LIB)
message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES})
add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
else()
message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
# Optionally link against tcmalloc_minimal (detection logic re-used from sentencepiece).
# Nothing happens when USE_TCMALLOC is off or when configuring on Windows.
if(USE_TCMALLOC)
  if(NOT WIN32)
    # For static builds look for the archive by its exact file name; otherwise let
    # find_library() resolve the library name for the platform.
    if(USE_STATIC_LIBS)
      find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)
    else()
      find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
    endif()
    if(TCMALLOC_LIB)
      message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
      # Link the library that find_library() located. The result is stored in
      # TCMALLOC_LIB; the previously used Tcmalloc_LIBRARIES is not set by anything
      # in this section, so TCMalloc was reported as found but not added to EXT_LIBS.
      set(EXT_LIBS ${EXT_LIBS} ${TCMALLOC_LIB})
      # Keep the compiler from substituting its built-in allocation routines.
      add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
    else()
      message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
    endif()
  endif()
endif()

Expand Down Expand Up @@ -632,7 +648,7 @@
endif()

if(DETERMINISTIC)
message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down")

Check warning on line 651 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU-only

Option DETERMINISTIC=ON: Trying to make training as deterministic as

Check warning on line 651 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU+CUDA

Option DETERMINISTIC=ON: Trying to make training as deterministic as

Check warning on line 651 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU+CUDA

Option DETERMINISTIC=ON: Trying to make training as deterministic as
add_definitions(-DDETERMINISTIC=1)
list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=1; )
else()
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v1.12.14
v1.12.31
Loading
Loading