Skip to content

Commit

Permalink
Merged PR 31744: Pymarian: python bindings to marian
Browse files Browse the repository at this point in the history
* This code is the same as the [public GitHub repo tg/pybind-new branch](#1013). The Git histories seem to differ slightly between the public and private repos, so we are seeing a lot of commits
* This builds on top of work by Elijah #948
  • Loading branch information
Thamme Gowda authored and emjotde committed Feb 3, 2024
1 parent b5c892e commit 1c63c1e
Show file tree
Hide file tree
Showing 40 changed files with 2,035 additions and 92 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,11 @@ jobs:
./marian --version
./marian-decoder --version
./marian-scorer --version
./spm_encode --version
ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \))
- name: Install PyMarian
run: |
python3 -m pip install --upgrade pip setuptools wheel pytest
CMAKE_ARGS="" python3 -m pip install -v .
python3 -m pymarian -v
MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
11 changes: 11 additions & 0 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ jobs:
-DCOMPILE_CPU=${{ matrix.cpu }} \
-DCOMPILE_CUDA=${{ matrix.gpu }} \
-DCOMPILE_EXAMPLES=${{ matrix.examples }} \
-DUSE_TCMALLOC=OFF \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=${{ matrix.unit_tests }} \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
Expand Down Expand Up @@ -143,3 +144,13 @@ jobs:
./marian-server --version
./spm_encode --version
ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \))
- name: Install PyMarian
working-directory: build
env:
CUDA_VERSION: ${{ matrix.cuda }}
run: |
python3 -m pip install --upgrade pip setuptools wheel pytest
CMAKE_ARGS="" python3 -m pip install -v .
python3 -m pymarian -v
MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
12 changes: 12 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,16 @@ jobs:
.\marian-decoder.exe --version
.\marian-scorer.exe --version
dir *.exe
cd ..
shell: cmd

- name: Install PyMarian
working-directory: src/python
run: |
python3 -m pip install --upgrade pip setuptools wheel pytest
python3 -m pip install -v .
python3 -m pymarian -v
python3 -m pytest -vs src/python/tests
env:
CUDA_VERSION: ${{ matrix.cuda }}
shell: cmd
11 changes: 9 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Config files from CMake
.history*
src/common/project_version.h
src/common/git_revision.h
src/common/build_info.cpp
Expand Down Expand Up @@ -48,6 +48,8 @@ pingme.txt
# CMake files
build
build-*
# pymarian wheels
dist/

# Examples
examples/*/*.gz
Expand All @@ -61,4 +63,9 @@ examples/mnist/*ubyte
/vs/MarianDll.VC.VC.opendb

.vs
.vscode
.vscode

# Python : pymarian
*.whl
*.egg-info
src/python/pymarian/_version.py
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/pybind11"]
path = src/3rd_party/pybind11
url = https://github.com/pybind/pybind11.git
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fixed compilation with clang 16.0.6

### Added
- Added `pymarian`: python bindings based on pybind11
- Added implementation of COMET-KIWI
- Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now)
- Added implementation of COMET-22 (reference-based) model and conversion
Expand Down
7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_TCMALLOC "Use TCMALLOC if available" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)
option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF)

# fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them,
# so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12,
Expand Down Expand Up @@ -105,7 +107,7 @@ if(MSVC)
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS "/permissive- /EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

Expand Down Expand Up @@ -347,6 +349,7 @@ if(CUDA_FOUND)
LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
endif()

message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
if(COMPILE_KEPLER)
message(STATUS "Compiling code for Kepler GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
Expand Down Expand Up @@ -412,7 +415,7 @@ if(CUDA_FOUND)
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found")
message(FATAL_ERROR "cuBLASLt library not found. -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
Expand Down
27 changes: 25 additions & 2 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ stages:
-DUSE_NCCL="FALSE" ^
-DUSE_SENTENCEPIECE="TRUE" ^
-DUSE_STATIC_LIBS="TRUE"
displayName: Configure CMake
env:
# Set envvars so that CMake can find the installed packages
Expand Down Expand Up @@ -322,6 +323,10 @@ stages:
displayName: Install CUDA
condition: eq(variables.gpu, true)

# Some preinstalled versions of pip are bad for pymarian; see https://github.com/pypa/setuptools/issues/3269
- bash: python3 -m pip install pip -U
displayName: Upgrade pip

- bash: |
mkdir -p build
cd build
Expand All @@ -336,7 +341,11 @@ stages:
-DUSE_SENTENCEPIECE=on \
-DUSE_STATIC_LIBS=$(static) \
-DBoost_ARCHITECTURE=-x64 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda)
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) \
-DUSE_TCMALLOC=off \
-DPYMARIAN=ON \
-DPYTHON_EXECUTABLE=python3
displayName: Configure CMake
# Clean build/src/ to save disk space on Azure-hosted VMs and stay below the 10GB limit
Expand All @@ -361,6 +370,11 @@ stages:
displayName: Print versions
workingDirectory: build
- bash: |
python3 -m pip install build/pymarian-*.whl
python3 -m pymarian -v
displayName: Build Pymarian
######################################################################
- job: BuildMacOS
cancelTimeoutInMinutes: 1
Expand Down Expand Up @@ -393,6 +407,7 @@ stages:
-DUSE_FBGEMM=on \
-DUSE_SENTENCEPIECE=on \
-DUSE_STATIC_LIBS=off
displayName: Configure CMake
- bash: make -j2
Expand Down Expand Up @@ -453,7 +468,10 @@ stages:
-DCOMPILE_CUDA=off \
-DGENERATE_MARIAN_INSTALL_TARGETS=on \
-DUSE_FBGEMM=on \
-DUSE_SENTENCEPIECE=on
-DUSE_SENTENCEPIECE=on \
-DPYMARIAN=on \
-DPYTHON_EXECUTABLE=python3
displayName: Configure CMake
- bash: make -j3 install
Expand All @@ -468,6 +486,11 @@ stages:
displayName: Check targets
workingDirectory: install
- bash: |
python3 -m pip install build/pymarian-*.whl
python3 -m pymarian -v
displayName: Build Pymarian
# Marian is built in the same job where the regression tests are run to make sure that executables
# are compiled and run on a machine with the same CPU architecture, which is required for
Expand Down
16 changes: 10 additions & 6 deletions src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,17 @@ if(USE_SENTENCEPIECE)

# regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.")

if(USE_STATIC_LIBS)
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
else(USE_STATIC_LIBS)
set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.")
endif(USE_STATIC_LIBS)
# Forward Marian's USE_TCMALLOC option to the SentencePiece build through its SPM_* cache variables.
# NOTE(review): the set(... CACHE ...) calls without FORCE do not overwrite a value that is already
# in the CMake cache, so toggling USE_TCMALLOC on an existing build directory may need a fresh cache.
if(USE_TCMALLOC)
set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.")
# Static vs. shared linking of TCMalloc follows USE_STATIC_LIBS; only the static case is FORCEd.
if(USE_STATIC_LIBS)
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
else(USE_STATIC_LIBS)
set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.")
endif(USE_STATIC_LIBS)
else(USE_TCMALLOC)
# TCMalloc explicitly disabled for SentencePiece when USE_TCMALLOC is off.
set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "Enable TCMalloc if available.")
endif(USE_TCMALLOC)

add_subdirectory(./sentencepiece)
include_directories(./sentencepiece)
Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/pybind11
Submodule pybind11 added at 869cc1
37 changes: 31 additions & 6 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ include_directories(.)
include_directories(3rd_party)
include_directories(3rd_party/SQLiteCpp/include)
include_directories(3rd_party/sentencepiece)

if(USE_SENTENCEPIECE)
include_directories(3rd_party/sentencepiece/third_party/protobuf-lite)
endif(USE_SENTENCEPIECE)
Expand Down Expand Up @@ -260,11 +261,11 @@ if (NOT COMPILE_LIBRARY_ONLY)
endif(COMPILE_SERVER)

foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian)
if(CUDA_FOUND)
target_link_libraries(${exec} marian_cuda)
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
target_link_libraries(${exec} marian)
if(CUDA_FOUND)
target_link_libraries(${exec} marian_cuda)
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)
endif(NOT COMPILE_LIBRARY_ONLY)

Expand All @@ -282,9 +283,33 @@ endif(COMPILE_EXAMPLES)

if(GENERATE_MARIAN_INSTALL_TARGETS)
# Install the marian library if given a "make install" target
include(GNUInstallDirs) # This defines default values for installation directories (all platforms even if named GNU)
include(GNUInstallDirs) # This defines default values for installation directories (all platforms even if named GNU)
install(TARGETS marian
EXPORT marian-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif(GENERATE_MARIAN_INSTALL_TARGETS)


# Optional Python bindings: builds the pybind11 extension module `_pymarian` and packages it
# into a wheel. Only active when the PYMARIAN option is ON.
if(PYMARIAN)
# Interpreter used to run `pip wheel` below; callers may override with -DPYTHON_EXECUTABLE=...
if(NOT PYTHON_EXECUTABLE)
set(PYTHON_EXECUTABLE python) # default to python in the environment
endif()

# pybind11 is vendored under 3rd_party; adding it as a subdirectory provides pybind11_add_module()
include_directories(3rd_party/pybind11/include)
add_subdirectory(3rd_party/pybind11)

# Extension module compiled from python/binding/bind.cpp and linked against the marian library
# (plus marian_cuda when CUDA was found)
pybind11_add_module(_pymarian MODULE python/binding/bind.cpp)
target_link_libraries(_pymarian PUBLIC marian)
if(CUDA_FOUND)
target_link_libraries(_pymarian PUBLIC marian_cuda)
endif(CUDA_FOUND)
install(TARGETS _pymarian DESTINATION .)

# build pymarian wheel
# The `pymarian` target is part of ALL and runs `pip wheel` on src/python after _pymarian is built,
# writing the wheel into the build directory (-w). CMAKE_BINARY_DIR and CMAKE_SOURCE_DIR are passed
# to pip as environment variables -- presumably consumed by the Python packaging script to locate
# the compiled module; verify against src/python.
add_custom_target(pymarian ALL
${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
"${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}"
DEPENDS _pymarian
VERBATIM COMMENT "Building pymarian wheel")
endif(PYMARIAN)
14 changes: 12 additions & 2 deletions src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,22 @@ std::vector<DeviceId> Config::getDevices(Ptr<Options> options,
return devices;
}

Ptr<Options>
parseOptions(int argc, char** argv, cli::mode mode, bool validate){
// Parse command-line arguments given in the classic (argc, argv) form.
// A ConfigParser is created for the requested mode and the actual parsing,
// including the optional validation step, is delegated to it.
Ptr<Options> parseOptions(int argc, char** argv, cli::mode mode, bool validate) {
  ConfigParser parser(mode);
  Ptr<Options> options = parser.parseOptions(argc, argv, validate);
  return options;
}

// Parse command-line options supplied as a single space-delimited string.
// The string is split on " " and converted into a synthetic (argc, argv) pair,
// with "marian" used as a placeholder program name in argv[0]; parsing is then
// delegated to the (argc, argv) overload.
//
// Note: the input is split purely on the space character, so an option value that
// itself contains spaces (e.g. a quoted path) cannot be expressed through this overload.
Ptr<Options> parseOptions(const std::string& args, cli::mode mode, bool validate) {
  // The token strings own the character data; argv below only borrows pointers into
  // them, so both vArgs and dummy must stay alive until parsing has finished.
  std::vector<std::string> vArgs = utils::split(args, " ");

  std::string dummy("marian");
  std::vector<char*> cArgs;
  cArgs.reserve(vArgs.size() + 2);  // program name + tokens + terminating nullptr
  cArgs.push_back(&dummy[0]);
  for(auto& arg : vArgs)
    cArgs.push_back(&arg[0]);

  // argc counts only the real entries; the extra nullptr mirrors the argv convention
  // (argv[argc] == nullptr) so code that walks argv until a null pointer stays in bounds.
  const int argc = static_cast<int>(cArgs.size());
  cArgs.push_back(nullptr);

  return parseOptions(argc, cArgs.data(), mode, validate);
}

std::ostream& operator<<(std::ostream& out, const Config& config) {
YAML::Emitter outYaml;
cli::OutputYaml(config.get(), outYaml);
Expand Down
15 changes: 15 additions & 0 deletions src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,19 @@ Ptr<Options> parseOptions(int argc,
cli::mode mode,
bool validate = true);

/**
* Parse the command line options.
* Same as above, but args provided as C++ string object, space-delimited. This is used for instance
* in the python bindings as a simple string-based interface.
*
* @param args space delimited command line options
* @param mode change the set of available command-line options, e.g. training, translation, etc.
* @param validate validate parsed options and abort on failure
*
* @return parsed options
*/
Ptr<Options> parseOptions(const std::string& args,
cli::mode mode,
bool validate = true);

} // namespace marian
18 changes: 11 additions & 7 deletions src/common/logging.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ std::shared_ptr<spdlog::logger> createStderrLogger(const std::string& name,
const std::string& pattern,
const std::vector<std::string>& files,
bool quiet) {
std::vector<spdlog::sink_ptr> sinks;
auto logger = spdlog::get(name);
if(!logger) {
std::vector<spdlog::sink_ptr> sinks;

auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance();
if(!quiet)
sinks.push_back(stderr_sink);
auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance();
if(!quiet)
sinks.push_back(stderr_sink);

// @TODO: think how to solve this better than using OMPI_COMM_WORLD_RANK env variable
// only create output files if we are the main process or if MPI rank is not defined
Expand All @@ -42,10 +44,11 @@ std::shared_ptr<spdlog::logger> createStderrLogger(const std::string& name,
}
}

auto logger = std::make_shared<spdlog::logger>(name, begin(sinks), end(sinks));
logger = std::make_shared<spdlog::logger>(name, begin(sinks), end(sinks));

spdlog::register_logger(logger);
logger->set_pattern(pattern);
spdlog::register_logger(logger);
logger->set_pattern(pattern);
}
return logger;
}

Expand All @@ -72,6 +75,7 @@ bool setLoggingLevel(spdlog::logger& logger, std::string const level) {
}

static void setErrorHandlers();

void createLoggers(const marian::Config* config) {
std::vector<std::string> generalLogs;
std::vector<std::string> validLogs;
Expand Down
Loading

0 comments on commit 1c63c1e

Please sign in to comment.