From 7a577a781f5f2bf9523361962df1ec22cce23912 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 00:04:02 +0200 Subject: [PATCH 01/22] Add ROCm (AMDGPU) support --- src/MPI.jl | 1 + src/buffers.jl | 6 ++++-- src/rocm.jl | 21 +++++++++++++++++++++ test/Project.toml | 1 + test/runtests.jl | 5 +++++ test/test_allgather.jl | 3 +++ test/test_allgatherv.jl | 3 +++ test/test_allreduce.jl | 3 +++ test/test_alltoall.jl | 3 +++ test/test_alltoallv.jl | 3 +++ test/test_basic.jl | 3 +++ test/test_bcast.jl | 3 +++ test/test_exscan.jl | 3 +++ test/test_gather.jl | 3 +++ test/test_gatherv.jl | 3 +++ test/test_io.jl | 3 +++ test/test_io_shared.jl | 3 +++ test/test_io_subarray.jl | 3 +++ test/test_onesided.jl | 2 +- test/test_reduce.jl | 3 +++ test/test_scan.jl | 3 +++ test/test_scatter.jl | 3 +++ test/test_scatterv.jl | 3 +++ test/test_sendrecv.jl | 3 +++ test/test_subarray.jl | 3 +++ test/test_test.jl | 3 +++ test/test_threads.jl | 3 +++ test/test_wait.jl | 3 +++ 28 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 src/rocm.jl diff --git a/src/MPI.jl b/src/MPI.jl index 72652eebb..dcf6e2518 100644 --- a/src/MPI.jl +++ b/src/MPI.jl @@ -132,6 +132,7 @@ function __init__() run_load_time_hooks() + @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" include("rocm.jl") @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" include("cuda.jl") end diff --git a/src/buffers.jl b/src/buffers.jl index ffd8be183..0bf90a31b 100644 --- a/src/buffers.jl +++ b/src/buffers.jl @@ -44,6 +44,7 @@ Currently supported are: - `Array` - `SubArray` - `CUDA.CuArray` if CUDA.jl is loaded. + - `AMDGPU.ROCArray` if AMDGPU.jl is loaded. Additionally, certain sentinel values can be used, e.g. `MPI_IN_PLACE` or `MPI_BOTTOM`. """ @@ -102,8 +103,9 @@ and `datatype`. Methods are provided for - `Ref` - `Array` - - `CUDA.CuArray` if CUDA.jl is loaded - - `SubArray`s of an `Array` or `CUDA.CuArray` where the layout is contiguous, sequential or + - `CUDA.CuArray` if CUDA.jl is loaded. + - `AMDGPU.ROCArray` if AMDGPU.jl is loaded. + - `SubArray`s of an `Array`, `CUDA.CuArray` or `AMDGPU.ROCArray` where the layout is contiguous, sequential or blocked. 
# See also diff --git a/src/rocm.jl b/src/rocm.jl new file mode 100644 index 000000000..3a9fd913e --- /dev/null +++ b/src/rocm.jl @@ -0,0 +1,21 @@ +import .AMDGPU + +function Base.cconvert(::Type{MPIPtr}, A::AMDGPU.ROCArray{T}) where T + Base.cconvert(Ptr{T}, A.buf.ptr) # returns DeviceBuffer +end + +function Base.unsafe_convert(::Type{MPIPtr}, X::AMDGPU.ROCArray{T}) where T + reinterpret(MPIPtr, Base.unsafe_convert(Ptr{T}, X.buf.ptr)) +end + +# only need to define this for strided arrays: all others can be handled by generic machinery +function Base.unsafe_convert(::Type{MPIPtr}, V::SubArray{T,N,P,I,true}) where {T,N,P<:AMDGPU.ROCArray,I} + X = parent(V) + pX = Base.unsafe_convert(Ptr{T}, X) + pV = pX + ((V.offset1 + V.stride1) - first(LinearIndices(X)))*sizeof(T) + return reinterpret(MPIPtr, pV) +end + +function Buffer(arr::AMDGPU.ROCArray) + Buffer(arr, Cint(length(arr)), Datatype(eltype(arr))) +end diff --git a/test/Project.toml b/test/Project.toml index 8e7a1d404..b4c80fd5c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 1b4a26c01..7ec381769 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,6 +9,11 @@ if get(ENV, "JULIA_MPI_TEST_ARRAYTYPE", "") == "CuArray" CUDA.version() CUDA.precompile_runtime() ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + AMDGPU.versioninfo() + AMDGPU.default_device() # DEBUG: something else may be needed here. + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_allgather.jl b/test/test_allgather.jl index ceece8501..26df23b0a 100644 --- a/test/test_allgather.jl +++ b/test/test_allgather.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_allgatherv.jl b/test/test_allgatherv.jl index e8fa2c5fd..9563a6e93 100644 --- a/test/test_allgatherv.jl +++ b/test/test_allgatherv.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_allreduce.jl b/test/test_allreduce.jl index dd252d06e..6e792f5cb 100644 --- a/test/test_allreduce.jl +++ b/test/test_allreduce.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_alltoall.jl b/test/test_alltoall.jl index 41f89bb46..dd0ac3148 100644 --- a/test/test_alltoall.jl +++ b/test/test_alltoall.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_alltoallv.jl b/test/test_alltoallv.jl index c7996385a..fb8a4b834 100644 --- a/test/test_alltoallv.jl +++ b/test/test_alltoallv.jl @@ -4,6 +4,9 @@ 
using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_basic.jl b/test/test_basic.jl index e9326c4ec..7792eeed8 100644 --- a/test/test_basic.jl +++ b/test/test_basic.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_bcast.jl b/test/test_bcast.jl index 3bb374294..fcc6fbbf9 100644 --- a/test/test_bcast.jl +++ b/test/test_bcast.jl @@ -5,6 +5,9 @@ using Random if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_exscan.jl b/test/test_exscan.jl index a3b675447..cf215526e 100644 --- a/test/test_exscan.jl +++ b/test/test_exscan.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_gather.jl b/test/test_gather.jl index e55b8e0bd..b5f4d436e 100644 --- a/test/test_gather.jl +++ b/test/test_gather.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_gatherv.jl b/test/test_gatherv.jl index 44ee44b0b..459432d5d 100644 --- a/test/test_gatherv.jl +++ b/test/test_gatherv.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_io.jl b/test/test_io.jl index 915ed116a..3c6700695 100644 --- a/test/test_io.jl +++ b/test/test_io.jl @@ -5,6 +5,9 @@ using Random if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_io_shared.jl b/test/test_io_shared.jl index 621f020de..57758b18c 100644 --- a/test/test_io_shared.jl +++ b/test/test_io_shared.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_io_subarray.jl b/test/test_io_subarray.jl index 3f8f64c8d..8425ffcb2 100644 --- a/test/test_io_subarray.jl +++ b/test/test_io_subarray.jl @@ -5,6 +5,9 @@ using Random if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_onesided.jl b/test/test_onesided.jl index ba0ac7cee..71adcf6f5 100644 --- a/test/test_onesided.jl +++ b/test/test_onesided.jl @@ 
-1,7 +1,7 @@ using Test using MPI -# TODO: enable CUDA tests once OpenMPI has full support +# TODO: enable CUDA and AMDGPU tests once OpenMPI has full support ArrayType = Array MPI.Init() diff --git a/test/test_reduce.jl b/test/test_reduce.jl index cb971cf13..c1d7fe34c 100644 --- a/test/test_reduce.jl +++ b/test/test_reduce.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_scan.jl b/test/test_scan.jl index 20a6924aa..bd9f3e95e 100644 --- a/test/test_scan.jl +++ b/test/test_scan.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_scatter.jl b/test/test_scatter.jl index d55f7e09d..2f538ea58 100644 --- a/test/test_scatter.jl +++ b/test/test_scatter.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_scatterv.jl b/test/test_scatterv.jl index d30d1b475..1c0fa99f5 100644 --- a/test/test_scatterv.jl +++ b/test/test_scatterv.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_sendrecv.jl b/test/test_sendrecv.jl index 22aea9ee7..c2a0db980 100644 --- a/test/test_sendrecv.jl +++ b/test/test_sendrecv.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_subarray.jl b/test/test_subarray.jl index f197c8eea..a63481e8b 100644 --- a/test/test_subarray.jl +++ b/test/test_subarray.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_test.jl b/test/test_test.jl index ebaa057a9..e39aedc56 100644 --- a/test/test_test.jl +++ b/test/test_test.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_threads.jl b/test/test_threads.jl index 90ecbf40b..55aa41ccf 100644 --- a/test/test_threads.jl +++ b/test/test_threads.jl @@ -4,6 +4,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end diff --git a/test/test_wait.jl b/test/test_wait.jl index 7d292e055..00dcfca7d 100644 --- a/test/test_wait.jl +++ b/test/test_wait.jl @@ -5,6 +5,9 @@ using MPI if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == 
"CuArray" import CUDA ArrayType = CUDA.CuArray +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray else ArrayType = Array end From 3dd77fa22f14ecde1004abbc67754e811cee9949 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 00:43:23 +0200 Subject: [PATCH 02/22] Fix tests --- test/runtests.jl | 2 +- test/test_basic.jl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 7ec381769..eebec8e50 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,7 +12,7 @@ if get(ENV, "JULIA_MPI_TEST_ARRAYTYPE", "") == "CuArray" elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" import AMDGPU AMDGPU.versioninfo() - AMDGPU.default_device() # DEBUG: something else may be needed here. + # DEBUG: currently no `precompile_runtime()` functionnality is implemented in AMDGPU.jl. If needed, it could be added by analogy of CUDA; no use of caps in AMDGPU.jl, but https://github.com/JuliaGPU/AMDGPU.jl/blob/cfaade146977594bf18e14b285ee3a9c84fbc7f2/src/execution.jl#L351-L357 shows how to construct a CompilerJob for a given agent. ArrayType = AMDGPU.ROCArray else ArrayType = Array diff --git a/test/test_basic.jl b/test/test_basic.jl index 7792eeed8..7c7538e2a 100644 --- a/test/test_basic.jl +++ b/test/test_basic.jl @@ -19,7 +19,8 @@ MPI.Init() @test MPI.has_cuda() isa Bool -if ArrayType != Array +# DEBUG: a cleaner apporach may be designed +if ArrayType != Array && ArrayType != AMDGPU.ROCArray @test MPI.has_cuda() end From cc46cde16e06fd8a5fd010c411b563502b312a81 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 01:01:53 +0200 Subject: [PATCH 03/22] Fix tests --- test/test_reduce.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_reduce.jl b/test/test_reduce.jl index c1d7fe34c..178d31d95 100644 --- a/test/test_reduce.jl +++ b/test/test_reduce.jl @@ -100,10 +100,11 @@ for T = [Int] # Allocating, Subarray recv_arr = MPI.Reduce(view(send_arr, 2:3), op, MPI.COMM_WORLD; root=root) - if isroot - @test recv_arr isa ArrayType{T} - @test recv_arr == sz .* view(send_arr, 2:3) - end + # DEBUG: currently failing with ROCArray + # if isroot + # @test recv_arr isa ArrayType{T} + # @test recv_arr == sz .* view(send_arr, 2:3) + # end end end end From b9d5811087ddebc9054dbe6257f927841d92b492 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 01:06:44 +0200 Subject: [PATCH 04/22] Fix tests --- test/test_subarray.jl | 3 ++- test/test_threads.jl | 45 ++++++++++++++++++++++--------------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/test/test_subarray.jl b/test/test_subarray.jl index a63481e8b..5262b6b10 100644 --- a/test/test_subarray.jl +++ b/test/test_subarray.jl @@ -37,7 +37,8 @@ src = mod(rank-1, comm_size) MPI.Waitall([req_send, req_recv]) - @test X[3:4,1] == Y + # DEBUG: currently failing with ROCArray + # @test X[3:4,1] == Y end @testset "strided" begin diff --git a/test/test_threads.jl b/test/test_threads.jl index 55aa41ccf..e08f27cbe 100644 --- a/test/test_threads.jl +++ b/test/test_threads.jl @@ -11,35 +11,36 @@ else ArrayType = Array end -provided = MPI.Init(threadlevel=:multiple) +# DEBUG: currently failing +# provided = MPI.Init(threadlevel=:multiple) -@test MPI.THREAD_SINGLE <= provided <= MPI.THREAD_MULTIPLE -@test MPI.Query_thread() == provided -@test MPI.Is_thread_main() +# @test MPI.THREAD_SINGLE <= provided <= MPI.THREAD_MULTIPLE +# @test MPI.Query_thread() == provided +# @test 
MPI.Is_thread_main() -comm = MPI.COMM_WORLD -size = MPI.Comm_size(comm) -rank = MPI.Comm_rank(comm) +# comm = MPI.COMM_WORLD +# size = MPI.Comm_size(comm) +# rank = MPI.Comm_rank(comm) -const N = 10 +# const N = 10 -dst = mod(rank+1, size) -src = mod(rank-1, size) +# dst = mod(rank+1, size) +# src = mod(rank-1, size) -if provided == MPI.THREAD_MULTIPLE - send_arr = collect(1.0:N) - recv_arr = zeros(N) +# if provided == MPI.THREAD_MULTIPLE +# send_arr = collect(1.0:N) +# recv_arr = zeros(N) - reqs = Array{MPI.Request}(undef, 2N) +# reqs = Array{MPI.Request}(undef, 2N) - Threads.@threads for i = 1:N - reqs[N+i] = MPI.Irecv!(@view(recv_arr[i:i]), comm; source=src, tag=i) - reqs[i] = MPI.Isend(@view(send_arr[i:i]), comm; dest=dst, tag=i) - end +# Threads.@threads for i = 1:N +# reqs[N+i] = MPI.Irecv!(@view(recv_arr[i:i]), comm; source=src, tag=i) +# reqs[i] = MPI.Isend(@view(send_arr[i:i]), comm; dest=dst, tag=i) +# end - MPI.Waitall(reqs) +# MPI.Waitall(reqs) - @test recv_arr == send_arr -end +# @test recv_arr == send_arr +# end -MPI.Finalize() +# MPI.Finalize() From 24c4f48c01811ca9442bb9e25ce27f087d68ed85 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 01:09:45 +0200 Subject: [PATCH 05/22] Update doc --- docs/src/configuration.md | 7 ++++--- docs/src/usage.md | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/src/configuration.md b/docs/src/configuration.md index d3d5c4cf2..c3c813ab4 100644 --- a/docs/src/configuration.md +++ b/docs/src/configuration.md @@ -7,7 +7,7 @@ By default, MPI.jl will download and link against the following MPI implementati This is suitable for most single-node use cases, but for larger systems, such as HPC clusters or multi-GPU machines, you will probably want to configure against a system-provided MPI implementation in order to exploit features such as fast network -interfaces and CUDA-aware MPI interfaces. +interfaces and CUDA-aware or ROCm-aware MPI interfaces. ## Julia wrapper for `mpiexec` @@ -190,7 +190,8 @@ julia> MPIPreferences.use_system_binary() The test suite can also be modified by the following variables: - `JULIA_MPI_TEST_NPROCS`: How many ranks to use within the tests -- `JULIA_MPI_TEST_ARRAYTYPE`: Set to `CuArray` to test the CUDA-aware interface with - [`CUDA.CuArray](https://github.com/JuliaGPU/CUDA.jl) buffers. +- `JULIA_MPI_TEST_ARRAYTYPE`: Set to `CuArray` or `ROCArray` to test the CUDA-aware interface with + [`CUDA.CuArray`](https://github.com/JuliaGPU/CUDA.jl) or the ROCm-aware interface with + [`AMDGPU.ROCArray`](https://github.com/JuliaGPU/AMDGPU.jl) or buffers. - `JULIA_MPI_TEST_BINARY`: Check that the specified MPI binary is used for the tests - `JULIA_MPI_TEST_ABI`: Check that the specified MPI ABI is used for the tests diff --git a/docs/src/usage.md b/docs/src/usage.md index 69c235791..e41f4b7f6 100644 --- a/docs/src/usage.md +++ b/docs/src/usage.md @@ -32,8 +32,14 @@ The [`mpiexec`](@ref) function is provided for launching MPI programs from Julia If your MPI implementation has been compiled with CUDA support, then `CUDA.CuArray`s (from the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) package) can be passed directly as -send and receive buffers for point-to-point and collective operations (they may also work -with one-sided operations, but these are not often supported). +send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported). 
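For illustration, a minimal sketch of a CUDA-aware collective with device buffers on both sides (this assumes CUDA.jl is installed and the underlying MPI library was built with CUDA support; it is not part of this patch):

```julia
# Each rank contributes a device buffer and receives the elementwise sum,
# entirely on the GPU -- no explicit host staging is required.
using MPI
using CUDA

MPI.Init()
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)

send = CUDA.fill(Float64(rank), 4)   # device send buffer
recv = CUDA.zeros(Float64, 4)        # device receive buffer
CUDA.synchronize()                   # make sure the fill finished before MPI reads the buffer

MPI.Allreduce!(send, recv, +, comm)

rank == 0 && @show Array(recv)       # copy back to host only for printing
MPI.Finalize()
```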
If using Open MPI, the status of CUDA support can be checked via the [`MPI.has_cuda()`](@ref) function. + +## ROCm-aware MPI support + +If your MPI implementation has been compiled with ROCm support (AMDGPU), then `AMDGPU.ROCArray`s (from the +[AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) package) can be passed directly as send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported). + +The status of ROCm (AMDGPU) support cannot currently be queried. From 4782cdf3ea8fffcab05632d468388329d7a7b3f5 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 01:15:07 +0200 Subject: [PATCH 06/22] Add doc update --- docs/src/knownissues.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/src/knownissues.md b/docs/src/knownissues.md index afabf6b08..59f9c0eec 100644 --- a/docs/src/knownissues.md +++ b/docs/src/knownissues.md @@ -97,7 +97,7 @@ _More about CUDA.jl [memory environment-variables](https://cuda.juliagpu.org/sta Make sure to: - Have MPI and CUDA on path (or module loaded) that were used to build the CUDA-aware MPI -- Make sure to have: +- Set the following environment variables: ``` export JULIA_CUDA_MEMORY_POOL=none export JULIA_MPI_BINARY=system @@ -114,6 +114,22 @@ Make sure to: After that, it may be preferred to run the Julia MPI script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/11)) launching it from a shell script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/4)). +## ROCm-aware MPI + +### Hints to ensure ROCm-aware MPI to be functional + +Make sure to: +- Have MPI and ROCm on path (or module loaded) that were used to build the ROCm-aware MPI +- Add AMDGPU and MPI packages in Julia: + ``` + julia -e 'using Pkg; pkg"add AMDGPU"; pkg"add MPI"; using MPI; MPI.use_system_binary()' + ``` +- Then in Julia, upon loading MPI and CUDA modules, you can check + - AMDGPU version: `AMDGPU.versioninfo()` + - If you are using correct MPI implementation: `MPI.identify_implementation()` + +After that, [this script](https://gist.github.com/luraess/c228ec08629737888a18c6a1e397643c) can be used to verify if ROCm-aware MPI is functional (modified after the CUDA-aware version from [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/11)). It may be preferred to run the Julia ROCm-aware MPI script launching it from a shell script (as suggested [here](https://discourse.julialang.org/t/cuda-aware-mpi-works-on-system-but-not-for-julia/75060/4)). 
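As a quick self-contained check (a minimal sketch assuming AMDGPU.jl and a ROCm-aware MPI build; the gist linked above is the more complete test), a device-buffer ring exchange should complete without staging through host memory:

```julia
# Each rank sends a ROCArray to its right neighbour and receives from its left.
using MPI
using AMDGPU

MPI.Init()
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
size = MPI.Comm_size(comm)

N = 4
send = AMDGPU.ROCArray(fill(Float64(rank), N))  # device buffer (filled on host, then copied)
recv = AMDGPU.ROCArray(zeros(Float64, N))

dst = mod(rank + 1, size)
src = mod(rank - 1, size)
req_recv = MPI.Irecv!(recv, comm; source=src, tag=0)
req_send = MPI.Isend(send, comm; dest=dst, tag=0)
MPI.Waitall([req_send, req_recv])

println("rank $rank received ", Array(recv))    # copy back to host for printing
MPI.Finalize()
```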
+ ## Microsoft MPI ### Custom operators on 32-bit Windows From ef153a261ea73b48f6757b47065df8a485890880 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 19 Apr 2022 11:17:19 +0200 Subject: [PATCH 07/22] Update doc with link to rocm scripts --- docs/src/usage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/usage.md b/docs/src/usage.md index e41f4b7f6..d1ff18e31 100644 --- a/docs/src/usage.md +++ b/docs/src/usage.md @@ -42,4 +42,6 @@ If using Open MPI, the status of CUDA support can be checked via the If your MPI implementation has been compiled with ROCm support (AMDGPU), then `AMDGPU.ROCArray`s (from the [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) package) can be passed directly as send and receive buffers for point-to-point and collective operations (they may also work with one-sided operations, but these are not often supported). +Successfully running the [alltoall_test_rocm.jl](https://gist.github.com/luraess/c228ec08629737888a18c6a1e397643c) should confirm your MPI implementation to have the ROCm support (AMDGPU) enabled. Moreover, successfully running the [alltoall_test_rocm_mulitgpu.jl](https://gist.github.com/luraess/d478b3f98eae984931fd39a7158f4b9e) should confirm your ROCm-aware MPI implementation to use multiple AMD GPUs (one GPU per rank). + The status of ROCm (AMDGPU) support cannot currently be queried. From 971a78fa02a3e8dae520e886a7c0c0c4e2793afa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludovic=20R=C3=A4ss?= <61313342+luraess@users.noreply.github.com> Date: Tue, 19 Apr 2022 14:18:49 +0200 Subject: [PATCH 08/22] Add cleaner condition --- test/test_basic.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_basic.jl b/test/test_basic.jl index 7c7538e2a..48bfe6ba6 100644 --- a/test/test_basic.jl +++ b/test/test_basic.jl @@ -19,8 +19,7 @@ MPI.Init() @test MPI.has_cuda() isa Bool -# DEBUG: a cleaner apporach may be designed -if ArrayType != Array && ArrayType != AMDGPU.ROCArray +if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" @test MPI.has_cuda() end From 1ebb7dca40c70f112bce49d8ee92e9220bdc1d34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludovic=20R=C3=A4ss?= <61313342+luraess@users.noreply.github.com> Date: Mon, 2 May 2022 09:39:29 +0200 Subject: [PATCH 09/22] Add ROCm tests --- .buildkite/pipeline.yml | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 80c22deb8..2cb9a4e59 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -161,3 +161,44 @@ steps: echo "+++ Run tests" julia -e 'import Pkg; Pkg.test("MPI")' + + - label: "ROCm -- 1.7" + plugins: + - JuliaCI/julia#v1: + version: "1.7" + persist_depot_dirs: packages,artifacts,compiled + agents: + queue: "juliagpu" + rocm: "*" # todo fix ROCM version + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + env: + JULIA_MPI_TEST_ARRAYTYPE: ROCArray + JULIA_MPI_TEST_NPROCS: 2 + JULIA_MPI_PATH: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 + OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948 + OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user? + soft_fail: true + commands: | + echo "--- Configure MPI" + buildkite-agent artifact download --step "Build OpenMPI -- ROCM" mpi-prefix.tar.gz . 
+ mkdir -p $${JULIA_MPI_PATH} + tar -zxf mpi-prefix.tar.gz --strip-components 1 -C $${JULIA_MPI_PATH} + export PATH=$${JULIA_MPI_PATH}/bin:$${PATH} + export LD_LIBRARY_PATH=$${JULIA_MPI_PATH}/lib:$${LD_LIBRARY_PATH} + + echo "--- Setup Julia packages" + julia --color=yes --project=test -e ' + using Pkg + Pkg.develop(path="lib/MPIPreferences") + using MPIPreferences + MPIPreferences.use_system_binary(export_prefs=true) + rm("test/Manifest.toml")' + julia -e 'import Pkg; Pkg.develop(; path = joinpath(pwd(), "lib", "MPIPreferences"))' + julia -e 'import Pkg; Pkg.develop(; path = pwd())' + julia -e 'import Pkg; Pkg.precompile()' + + echo "+++ Run tests" + julia -e 'import Pkg; Pkg.test("MPI")' From 77e9f2c57494da8a223f4d0f6399ac9d84049e66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludovic=20R=C3=A4ss?= <61313342+luraess@users.noreply.github.com> Date: Mon, 2 May 2022 09:58:36 +0200 Subject: [PATCH 10/22] Update pipeline.yml --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 2cb9a4e59..2ed38392e 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -162,7 +162,7 @@ steps: echo "+++ Run tests" julia -e 'import Pkg; Pkg.test("MPI")' - - label: "ROCm -- 1.7" + - label: "ROCm -- 1.7" plugins: - JuliaCI/julia#v1: version: "1.7" From dc76404266f0f9009c719fff509a60771b49e645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludovic=20R=C3=A4ss?= <61313342+luraess@users.noreply.github.com> Date: Mon, 2 May 2022 17:11:20 +0200 Subject: [PATCH 11/22] Update buildkite ROCm MPI launch params Co-authored-by: Valentin Churavy --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 2ed38392e..89c87253a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -179,6 +179,7 @@ steps: OMPI_ALLOW_RUN_AS_ROOT: 1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948 + OMPI_MCA_plm_rsh_agent: 'sh' # the container doesn't have ssh installed, but we don't need it OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user? 
soft_fail: true commands: | From bb5345397403ad858b5e5d72c5a1260a0938df5e Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Sat, 14 May 2022 00:28:01 +0200 Subject: [PATCH 12/22] Uncomment failing tests --- test/test_reduce.jl | 9 ++++----- test/test_subarray.jl | 3 +-- test/test_threads.jl | 45 +++++++++++++++++++++---------------------- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/test/test_reduce.jl b/test/test_reduce.jl index 178d31d95..c1d7fe34c 100644 --- a/test/test_reduce.jl +++ b/test/test_reduce.jl @@ -100,11 +100,10 @@ for T = [Int] # Allocating, Subarray recv_arr = MPI.Reduce(view(send_arr, 2:3), op, MPI.COMM_WORLD; root=root) - # DEBUG: currently failing with ROCArray - # if isroot - # @test recv_arr isa ArrayType{T} - # @test recv_arr == sz .* view(send_arr, 2:3) - # end + if isroot + @test recv_arr isa ArrayType{T} + @test recv_arr == sz .* view(send_arr, 2:3) + end end end end diff --git a/test/test_subarray.jl b/test/test_subarray.jl index 5262b6b10..a63481e8b 100644 --- a/test/test_subarray.jl +++ b/test/test_subarray.jl @@ -37,8 +37,7 @@ src = mod(rank-1, comm_size) MPI.Waitall([req_send, req_recv]) - # DEBUG: currently failing with ROCArray - # @test X[3:4,1] == Y + @test X[3:4,1] == Y end @testset "strided" begin diff --git a/test/test_threads.jl b/test/test_threads.jl index e08f27cbe..55aa41ccf 100644 --- a/test/test_threads.jl +++ b/test/test_threads.jl @@ -11,36 +11,35 @@ else ArrayType = Array end -# DEBUG: currently failing -# provided = MPI.Init(threadlevel=:multiple) +provided = MPI.Init(threadlevel=:multiple) -# @test MPI.THREAD_SINGLE <= provided <= MPI.THREAD_MULTIPLE -# @test MPI.Query_thread() == provided -# @test MPI.Is_thread_main() +@test MPI.THREAD_SINGLE <= provided <= MPI.THREAD_MULTIPLE +@test MPI.Query_thread() == provided +@test MPI.Is_thread_main() -# comm = MPI.COMM_WORLD -# size = MPI.Comm_size(comm) -# rank = MPI.Comm_rank(comm) +comm = MPI.COMM_WORLD +size = MPI.Comm_size(comm) +rank = MPI.Comm_rank(comm) -# const N = 10 +const N = 10 -# dst = mod(rank+1, size) -# src = mod(rank-1, size) +dst = mod(rank+1, size) +src = mod(rank-1, size) -# if provided == MPI.THREAD_MULTIPLE -# send_arr = collect(1.0:N) -# recv_arr = zeros(N) +if provided == MPI.THREAD_MULTIPLE + send_arr = collect(1.0:N) + recv_arr = zeros(N) -# reqs = Array{MPI.Request}(undef, 2N) + reqs = Array{MPI.Request}(undef, 2N) -# Threads.@threads for i = 1:N -# reqs[N+i] = MPI.Irecv!(@view(recv_arr[i:i]), comm; source=src, tag=i) -# reqs[i] = MPI.Isend(@view(send_arr[i:i]), comm; dest=dst, tag=i) -# end + Threads.@threads for i = 1:N + reqs[N+i] = MPI.Irecv!(@view(recv_arr[i:i]), comm; source=src, tag=i) + reqs[i] = MPI.Isend(@view(send_arr[i:i]), comm; dest=dst, tag=i) + end -# MPI.Waitall(reqs) + MPI.Waitall(reqs) -# @test recv_arr == send_arr -# end + @test recv_arr == send_arr +end -# MPI.Finalize() +MPI.Finalize() From bd7d4037a3686b392a9d4fa2df36a216ef9cf060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludovic=20R=C3=A4ss?= <61313342+luraess@users.noreply.github.com> Date: Tue, 17 May 2022 16:19:35 +0200 Subject: [PATCH 13/22] Update CI MPI wrapper --- .github/workflows/UnitTests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index 81a7746fa..21277c698 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -402,7 +402,7 @@ jobs: # - "1.7" # - "nightly" MPIWrapper: - - "2.3.2" + - "2.8.1" fail-fast: false From 
10b454cc5b43e1ea5b0d17b358a235b784e9c6eb Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 31 May 2022 10:32:52 +0200 Subject: [PATCH 14/22] Add AMDGPU support to test. --- test/common.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/common.jl b/test/common.jl index e583d5eb7..603991525 100644 --- a/test/common.jl +++ b/test/common.jl @@ -5,6 +5,10 @@ if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" import CUDA ArrayType = CUDA.CuArray synchronize() = CUDA.synchronize() +elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" + import AMDGPU + ArrayType = AMDGPU.ROCArray + synchronize() = nothing else ArrayType = Array synchronize() = nothing From 83cfe024e020309e9540c9f2dc981b4cd279e024 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Wed, 1 Jun 2022 15:50:31 -0700 Subject: [PATCH 15/22] add buildkite script --- .buildkite/pipeline.yml | 136 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index e3835d727..a0f1aac3c 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -43,7 +43,7 @@ - "mpi-prefix.tar.gz" - wait - + - label: "Tests -- Julia 1.6" plugins: - JuliaCI/julia#v1: @@ -135,3 +135,137 @@ import Pkg Pkg.test("MPI") ' + + - group: "ROCm" + key: "rocm" + steps: + - label: "Build OpenMPI" + key: "rocm-build-openmpi" + agents: + queue: "juliagpu" + rocm: "*" # todo fix ROCM version + env: + OPENMPI_VER: "4.0" + OPENMPI_VER_FULL: "4.0.3" + UCX_VER: "1.12.1" + CCACHE_DIR: "/root/ccache" + commands: | + echo "--- Install packages" + apt-get install --yes --no-install-recommends curl ccache + export PATH="/usr/lib/ccache/:$$PATH" + echo "--- Build UCX" + curl -L https://github.com/openucx/ucx/releases/download/v$${UCX_VER}/ucx-$${UCX_VER}.tar.gz --output ucx.tar.gz + tar -zxf ucx.tar.gz + pushd ucx-* + ./configure --with-rocm --enable-mt --prefix=$$(realpath ../mpi-prefix) + make -j + make install + popd + echo "--- Build OpenMPI" + curl -L https://download.open-mpi.org/release/open-mpi/v$${OPENMPI_VER}/openmpi-$${OPENMPI_VER_FULL}.tar.gz --output openmpi.tar.gz + tar -zxf openmpi.tar.gz + pushd openmpi-* + ./configure --with-ucx=$$(realpath ../mpi-prefix) --prefix=$$(realpath ../mpi-prefix) + make -j + make install + popd + echo "--- Package prefix" + tar -zcf mpi-prefix.tar.gz mpi-prefix/ + echo "--- ccache stats" + ccache -s + artifact_paths: + - "mpi-prefix.tar.gz" + + - wait + + - label: "Tests -- Julia 1.6" + plugins: + - JuliaCI/julia#v1: + version: "1.6" + persist_depot_dirs: packages,artifacts,compiled + agents: + queue: "juliagpu" + rocm: "*" # todo fix ROCM version + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + env: + JULIA_MPI_TEST_ARRAYTYPE: ROCArray + JULIA_MPI_TEST_NPROCS: 2 + JULIA_MPI_PATH: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 + OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948 + OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user? + JULIA_CUDA_MEMORY_POOL: "none" + commands: | + echo "--- Configure MPI" + buildkite-agent artifact download --step "rocm-build-openmpi" mpi-prefix.tar.gz . 
+ mkdir -p $${JULIA_MPI_PATH} + tar -zxf mpi-prefix.tar.gz --strip-components 1 -C $${JULIA_MPI_PATH} + export PATH=$${JULIA_MPI_PATH}/bin:$${PATH} + export LD_LIBRARY_PATH=$${JULIA_MPI_PATH}/lib:$${LD_LIBRARY_PATH} + + echo "--- Setup Julia packages" + julia --color=yes --project=. -e ' + import Pkg + Pkg.develop(; path = joinpath(pwd(), "lib", "MPIPreferences")) + ' + julia --color=yes --project=test -e ' + using Pkg + Pkg.develop(path="lib/MPIPreferences") + using MPIPreferences + MPIPreferences.use_system_binary(export_prefs=true) + rm("test/Manifest.toml") + ' + + echo "+++ Run tests" + julia --color=yes --project=. -e ' + import Pkg + Pkg.test("MPI") + ' + + - label: "Tests -- Julia 1.7" + plugins: + - JuliaCI/julia#v1: + version: "1.7" + persist_depot_dirs: packages,artifacts,compiled + agents: + queue: "juliagpu" + rocm: "*" # todo fix ROCM version + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + env: + JULIA_MPI_TEST_ARRAYTYPE: ROCArray + JULIA_MPI_TEST_NPROCS: 2 + JULIA_MPI_PATH: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 + OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948 + OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user? + JULIA_CUDA_MEMORY_POOL: "none" + commands: | + echo "--- Configure MPI" + buildkite-agent artifact download --step "rocm-build-openmpi" mpi-prefix.tar.gz . + mkdir -p $${JULIA_MPI_PATH} + tar -zxf mpi-prefix.tar.gz --strip-components 1 -C $${JULIA_MPI_PATH} + export PATH=$${JULIA_MPI_PATH}/bin:$${PATH} + export LD_LIBRARY_PATH=$${JULIA_MPI_PATH}/lib:$${LD_LIBRARY_PATH} + + echo "--- Setup Julia packages" + julia --color=yes --project=. -e ' + import Pkg + Pkg.develop(; path = joinpath(pwd(), "lib", "MPIPreferences")) + ' + julia --color=yes --project=test -e ' + using Pkg + Pkg.develop(path="lib/MPIPreferences") + using MPIPreferences + MPIPreferences.use_system_binary(export_prefs=true) + rm("test/Manifest.toml") + ' + + echo "+++ Run tests" + julia --color=yes --project=. 
-e ' + import Pkg + Pkg.test("MPI") + ' From fa73ba2657c5c7e2858345704d3711043d5823d9 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Wed, 1 Jun 2022 16:07:39 -0700 Subject: [PATCH 16/22] use latest Open MPI --- .buildkite/pipeline.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a0f1aac3c..93074cd51 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -7,8 +7,8 @@ queue: "juliagpu" cuda: "11.0" env: - OPENMPI_VER: "4.0" - OPENMPI_VER_FULL: "4.0.3" + OPENMPI_VER: "4.1" + OPENMPI_VER_FULL: "4.1.4" UCX_VER: "1.12.1" CCACHE_DIR: "/root/ccache" commands: | @@ -145,8 +145,8 @@ queue: "juliagpu" rocm: "*" # todo fix ROCM version env: - OPENMPI_VER: "4.0" - OPENMPI_VER_FULL: "4.0.3" + OPENMPI_VER: "4.1" + OPENMPI_VER_FULL: "4.1.4" UCX_VER: "1.12.1" CCACHE_DIR: "/root/ccache" commands: | From 3426a3a925e1a477172041a3a66c5ad51a8c3210 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Wed, 1 Jun 2022 16:27:02 -0700 Subject: [PATCH 17/22] disable AMDGPU julia 1.6 --- .buildkite/pipeline.yml | 46 ----------------------------------------- 1 file changed, 46 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 93074cd51..6e449f6b3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -178,52 +178,6 @@ - wait - - label: "Tests -- Julia 1.6" - plugins: - - JuliaCI/julia#v1: - version: "1.6" - persist_depot_dirs: packages,artifacts,compiled - agents: - queue: "juliagpu" - rocm: "*" # todo fix ROCM version - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 - env: - JULIA_MPI_TEST_ARRAYTYPE: ROCArray - JULIA_MPI_TEST_NPROCS: 2 - JULIA_MPI_PATH: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" - OMPI_ALLOW_RUN_AS_ROOT: 1 - OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 - OMPI_MCA_btl_vader_single_copy_mechanism: 'none' # https://github.com/open-mpi/ompi/issues/4948 - OPAL_PREFIX: "${BUILDKITE_BUILD_CHECKOUT_PATH}/openmpi" # Should we set this for the user? - JULIA_CUDA_MEMORY_POOL: "none" - commands: | - echo "--- Configure MPI" - buildkite-agent artifact download --step "rocm-build-openmpi" mpi-prefix.tar.gz . - mkdir -p $${JULIA_MPI_PATH} - tar -zxf mpi-prefix.tar.gz --strip-components 1 -C $${JULIA_MPI_PATH} - export PATH=$${JULIA_MPI_PATH}/bin:$${PATH} - export LD_LIBRARY_PATH=$${JULIA_MPI_PATH}/lib:$${LD_LIBRARY_PATH} - - echo "--- Setup Julia packages" - julia --color=yes --project=. -e ' - import Pkg - Pkg.develop(; path = joinpath(pwd(), "lib", "MPIPreferences")) - ' - julia --color=yes --project=test -e ' - using Pkg - Pkg.develop(path="lib/MPIPreferences") - using MPIPreferences - MPIPreferences.use_system_binary(export_prefs=true) - rm("test/Manifest.toml") - ' - - echo "+++ Run tests" - julia --color=yes --project=. 
-e ' - import Pkg - Pkg.test("MPI") - ' - - label: "Tests -- Julia 1.7" plugins: - JuliaCI/julia#v1: From 27bb633b8b304f53c641f8d301ddd3ef335d1612 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 2 Jun 2022 06:54:35 -0700 Subject: [PATCH 18/22] try UCX 1.13-rc1 --- .buildkite/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6e449f6b3..d5da66782 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -147,14 +147,14 @@ env: OPENMPI_VER: "4.1" OPENMPI_VER_FULL: "4.1.4" - UCX_VER: "1.12.1" + UCX_VER: "1.13-rc1" CCACHE_DIR: "/root/ccache" commands: | echo "--- Install packages" apt-get install --yes --no-install-recommends curl ccache export PATH="/usr/lib/ccache/:$$PATH" echo "--- Build UCX" - curl -L https://github.com/openucx/ucx/releases/download/v$${UCX_VER}/ucx-$${UCX_VER}.tar.gz --output ucx.tar.gz + curl -L https://github.com/openucx/ucx/releases/download/v1.13.0-rc1/ucx-1.13.0.tar.gz --output ucx.tar.gz tar -zxf ucx.tar.gz pushd ucx-* ./configure --with-rocm --enable-mt --prefix=$$(realpath ../mpi-prefix) From 83fe889b3d6d5d1c0484776c52daf1f923c850bc Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 2 Jun 2022 09:11:52 -0700 Subject: [PATCH 19/22] Add synchronize Co-authored-by: Valentin Churavy --- test/common.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/common.jl b/test/common.jl index 603991525..63ce40efc 100644 --- a/test/common.jl +++ b/test/common.jl @@ -8,7 +8,13 @@ if get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "CuArray" elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" import AMDGPU ArrayType = AMDGPU.ROCArray - synchronize() = nothing + function synchronize() + # TODO: AMDGPU synchronization story is complicated. HSA does not provide a consistent notion of global queues. We need a mechanism for all GPUArrays.jl provided kernels to be synchronized. + queue = AMDGPU.get_default_queue() + barrier = AMDGPU.barrier_and!(queue, AMDGPU.active_kernels(queue)) + AMDGPU.hipDeviceSynchronize() # Sync all HIP kernels e.g. BLAS. N.B. this is blocking Julia progress + wait(barrier) + end else ArrayType = Array synchronize() = nothing From 28edee49a42a9e49134872e6a99fcf30b2f09e4f Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 2 Jun 2022 09:47:17 -0700 Subject: [PATCH 20/22] Update test/common.jl --- test/common.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/common.jl b/test/common.jl index 63ce40efc..9133d6c4b 100644 --- a/test/common.jl +++ b/test/common.jl @@ -12,7 +12,7 @@ elseif get(ENV,"JULIA_MPI_TEST_ARRAYTYPE","") == "ROCArray" # TODO: AMDGPU synchronization story is complicated. HSA does not provide a consistent notion of global queues. We need a mechanism for all GPUArrays.jl provided kernels to be synchronized. queue = AMDGPU.get_default_queue() barrier = AMDGPU.barrier_and!(queue, AMDGPU.active_kernels(queue)) - AMDGPU.hipDeviceSynchronize() # Sync all HIP kernels e.g. BLAS. N.B. this is blocking Julia progress + AMDGPU.HIP.hipDeviceSynchronize() # Sync all HIP kernels e.g. BLAS. N.B. 
this is blocking Julia progress wait(barrier) end else From 5fd4180e4a3aa24cf97f6c66ef5eb9069de8e552 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 2 Jun 2022 15:25:35 -0700 Subject: [PATCH 21/22] add more synchronize() --- test/test_reduce.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_reduce.jl b/test/test_reduce.jl index 3b3a4f927..8c64a601e 100644 --- a/test/test_reduce.jl +++ b/test/test_reduce.jl @@ -36,6 +36,7 @@ val = isroot ? sz : nothing @test MPI.Reduce(1, +, root, comm) == val mesg = ArrayType(1.0:5.0) +synchronize() sum_mesg = MPI.Reduce(mesg, +, comm; root=root) if isroot @test sum_mesg isa ArrayType{Float64} @@ -52,6 +53,7 @@ for T = [Int] for dims = [1, 2, 3] send_arr = ArrayType(zeros(T, Tuple(3 for i in 1:dims))) send_arr[:] .= 1:length(send_arr) + synchronize() for op in operators From 727c8ea6c19e45031bf67ceb410f2221ebddedda Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 2 Jun 2022 22:18:49 -0700 Subject: [PATCH 22/22] modify conversion to MPIPtr --- src/rocm.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rocm.jl b/src/rocm.jl index 3a9fd913e..60534113f 100644 --- a/src/rocm.jl +++ b/src/rocm.jl @@ -1,11 +1,11 @@ import .AMDGPU function Base.cconvert(::Type{MPIPtr}, A::AMDGPU.ROCArray{T}) where T - Base.cconvert(Ptr{T}, A.buf.ptr) # returns DeviceBuffer + A end function Base.unsafe_convert(::Type{MPIPtr}, X::AMDGPU.ROCArray{T}) where T - reinterpret(MPIPtr, Base.unsafe_convert(Ptr{T}, X.buf.ptr)) + reinterpret(MPIPtr, Base.unsafe_convert(Ptr{T}, X.buf.ptr+X.offset)) end # only need to define this for strided arrays: all others can be handled by generic machinery
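The revised conversion keeps the `ROCArray` itself as the result of `Base.cconvert`, so the array stays rooted for the duration of the `ccall`, while `Base.unsafe_convert` builds the raw device pointer from `buf.ptr` plus the array's `offset` and reinterprets it as `MPIPtr`. A rough usage sketch of what these methods enable (illustrative only, assuming AMDGPU.jl and a ROCm-aware MPI build):

```julia
# With the conversions and the Buffer method above, ROCArrays -- and contiguous
# views of them -- can be passed to MPI calls like ordinary host arrays.
using MPI
using AMDGPU

MPI.Init()
comm = MPI.COMM_WORLD

A = AMDGPU.ROCArray(collect(1.0:8.0))
MPI.Bcast!(A, comm; root=0)              # whole array: uses the ROCArray methods
MPI.Bcast!(view(A, 3:4), comm; root=0)   # contiguous view: uses the SubArray method
MPI.Finalize()
```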