From 3b8c449f17d058d79733fb721d2a1d09a247cae0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 1 Nov 2023 13:30:36 -0400 Subject: [PATCH 01/30] Remove empty quotation marks for static_assert --- .../unit_tests/TestStdAlgorithmsModOps.cpp | 4 +- .../TestStdAlgorithmsPartitionCopy.cpp | 6 +- containers/src/Kokkos_DynRankView.hpp | 2 +- core/src/Cuda/Kokkos_CudaSpace.hpp | 11 +- core/src/HIP/Kokkos_HIP_Space.hpp | 3 +- core/src/KokkosExp_MDRangePolicy.hpp | 4 +- core/src/Kokkos_HBWSpace.hpp | 7 +- core/src/Kokkos_HostSpace.hpp | 3 +- core/src/Kokkos_MathematicalFunctions.hpp | 2 +- core/src/Kokkos_View.hpp | 10 +- core/src/SYCL/Kokkos_SYCL_Space.hpp | 15 +- core/src/impl/Kokkos_ViewArray.hpp | 7 +- core/src/impl/Kokkos_ViewMapping.hpp | 28 +- core/src/traits/Kokkos_IndexTypeTrait.hpp | 2 +- .../traits/Kokkos_OccupancyControlTrait.hpp | 2 +- core/src/traits/Kokkos_PolicyTraitAdaptor.hpp | 4 +- core/src/traits/Kokkos_ScheduleTrait.hpp | 2 +- .../traits/Kokkos_WorkItemPropertyTrait.hpp | 2 +- core/unit_test/TestAggregate.hpp | 30 +- core/unit_test/TestComplex.hpp | 20 +- core/unit_test/TestConcepts.hpp | 68 ++--- core/unit_test/TestFunctorAnalysis.hpp | 58 ++-- .../TestHostSharedPtrAccessOnDevice.hpp | 2 +- .../TestJoinBackwardCompatibility.hpp | 5 +- core/unit_test/TestMathematicalFunctions.hpp | 52 ++-- core/unit_test/TestNumericTraits.hpp | 101 +++---- core/unit_test/TestTeamBasic.hpp | 2 +- core/unit_test/TestUtilities.hpp | 18 +- core/unit_test/TestViewAPI.hpp | 3 +- core/unit_test/TestViewMapping_a.hpp | 277 ++++++++---------- core/unit_test/TestViewMapping_b.hpp | 14 +- core/unit_test/cuda/TestCuda_Spaces.cpp | 166 +++++------ .../default/TestDefaultDeviceType.cpp | 9 +- core/unit_test/hip/TestHIP_Spaces.cpp | 150 ++++------ core/unit_test/sycl/TestSYCL_Spaces.cpp | 195 +++++------- core/unit_test/tools/TestProfilingSection.cpp | 10 +- 36 files changed, 555 insertions(+), 739 deletions(-) diff --git a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097e..c0130885dc5 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); m_view(index) = std::move(a); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce88..a36c9db2b9e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); const std::size_t ext = view_from.extent(0); diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee4..33d9562ea4f 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1340,7 +1340,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5..df03785b112 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -239,8 +239,7 @@ struct Impl::is_hip_type_space : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); //---------------------------------------- diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01ca..d0ae7fdcea5 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -96,7 +96,7 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), @@ -120,7 +120,7 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { a[i] = checked_narrow_cast(other[i]); (void)checked_narrow_cast(other[i]); // see note above diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp index 369b7bafb7b..56fe607b349 100644 --- a/core/src/Kokkos_HBWSpace.hpp +++ b/core/src/Kokkos_HBWSpace.hpp @@ -188,10 +188,9 @@ namespace Kokkos { namespace Impl { -static_assert( - Kokkos::Impl::MemorySpaceAccess::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HBWSpace, + Kokkos::Experimental::HBWSpace>::assignable); template <> struct MemorySpaceAccess { diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 90d14040637..c20bb1abc60 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -129,8 +129,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); template struct HostMirror { diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index 361d1317e94..3fead8dd293 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -198,7 +198,7 @@ using promote_3_t = typename promote_3::type; long double> \ FUNC(T1 x, T2 y) { \ using Promoted = Kokkos::Impl::promote_2_t; \ - static_assert(std::is_same_v, ""); \ + static_assert(std::is_same_v); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ } diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index bcbb28014cd..1d2b4b9be2c 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -814,15 +814,15 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); } public: diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b..252391b2f2c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -166,19 +166,16 @@ struct is_sycl_type_space : public std::true_type {}; static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); template <> struct MemorySpaceAccess> { private: using array_analysis = ViewArrayAnalysis; - static_assert(std::is_void

::value, ""); + static_assert(std::is_void

::value); static_assert(std::is_same>::value, - ""); + Kokkos::Array>::value); static_assert(std::is_scalar::value, "View of Array type must be of a scalar type"); @@ -507,7 +506,7 @@ class ViewMapping< Kokkos::LayoutStride>::value))>, SrcTraits, Args...> { private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); + static_assert(SrcTraits::rank == sizeof...(Args)); enum : bool { R0 = is_integral_extent<0, Args...>::value, diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 01d0dc4f681..16ca33a87d0 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -657,21 +657,20 @@ struct SubviewExtents { template KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. - static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -814,8 +813,7 @@ struct ViewDataAnalysis { // Must match array analysis when this default template is used. static_assert( std::is_same::value, - ""); + typename array_analysis::non_const_value_type>::value); public: using specialize = void; // No specialization @@ -3896,7 +3894,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits::value, ""); + static_assert(std::is_integral::value); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType; }; diff --git a/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c372..c2ca5a341f1 100644 --- a/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -163,7 +163,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::OccupancyControlTrait::policy_with_trait; diff --git a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762ad..98ad1d7ebbb 100644 --- a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp +++ b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list, type_list, NewTrait, std::enable_if_t::value>> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; @@ -92,7 +92,7 @@ template class PolicyTemplate, struct PolicyTraitAdaptorImpl, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; diff --git a/core/src/traits/Kokkos_ScheduleTrait.hpp b/core/src/traits/Kokkos_ScheduleTrait.hpp index 86130025530..4e91d89f0f9 100644 --- a/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template constexpr auto require(Policy const& p, Kokkos::Schedule) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule>; return new_policy_t{p}; diff --git a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c851..ae7aa6e534f 100644 --- a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty>; return new_policy_t{p}; diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index 4f67b2eddce..f1316a7426a 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -29,35 +29,31 @@ void TestViewAggregate() { value_type>; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); using a32_traits = Kokkos::ViewTraits; using flat_traits = Kokkos::ViewTraits; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); - static_assert(a32_traits::rank == 2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); + std::is_same::value); + static_assert(a32_traits::rank == 2); + static_assert(a32_traits::rank_dynamic == 2); - static_assert(std::is_void::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - static_assert(flat_traits::dimension::N2 == 32, ""); + static_assert(std::is_void::value); + static_assert(flat_traits::rank == 3); + static_assert(flat_traits::rank_dynamic == 2); + static_assert(flat_traits::dimension::N2 == 32); using a32_type = Kokkos::View **, DeviceType>; using a32_flat_type = typename a32_type::array_type; - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(a32_type::rank == 2); + static_assert(a32_flat_type::rank == 3); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/core/unit_test/TestComplex.hpp b/core/unit_test/TestComplex.hpp index bcae2e1d816..5501a35b7f0 100644 --- a/core/unit_test/TestComplex.hpp +++ b/core/unit_test/TestComplex.hpp @@ -451,17 +451,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex, long double, Kokkos::complex); diff --git a/core/unit_test/TestConcepts.hpp b/core/unit_test/TestConcepts.hpp index 476a8848325..b85867bf63a 100644 --- a/core/unit_test/TestConcepts.hpp +++ b/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); - -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); - -static_assert(Kokkos::is_device{}, ""); -static_assert(Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); - -static_assert(Kokkos::is_execution_space_v, ""); -static_assert(!Kokkos::is_execution_space_v, ""); +static_assert(Kokkos::is_execution_space{}); +static_assert(Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); + +static_assert(Kokkos::is_memory_space{}); +static_assert(Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); + +static_assert(Kokkos::is_device{}); +static_assert(Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); + +static_assert(Kokkos::is_execution_space_v); +static_assert(!Kokkos::is_execution_space_v); static_assert( - std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); + std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); /*------------------------------------------------- begin test for team_handle concept diff --git a/core/unit_test/TestFunctorAnalysis.hpp b/core/unit_test/TestFunctorAnalysis.hpp index c024526111b..e58324144e4 100644 --- a/core/unit_test/TestFunctorAnalysis.hpp +++ b/core/unit_test/TestFunctorAnalysis.hpp @@ -59,16 +59,15 @@ void test_functor_analysis() { using R01 = typename A01::Reducer; - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_same::value, - ""); - - static_assert(!A01::has_join_member_function, ""); - static_assert(!A01::has_init_member_function, ""); - static_assert(!A01::has_final_member_function, ""); - static_assert(A01::StaticValueSize == 0, ""); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_same::value); + + static_assert(!A01::has_join_member_function); + static_assert(!A01::has_init_member_function); + static_assert(!A01::has_final_member_function); + static_assert(A01::StaticValueSize == 0); ASSERT_EQ(R01(c01).length(), 0); //------------------------------ @@ -78,16 +77,15 @@ void test_functor_analysis() { Kokkos::RangePolicy, decltype(c02), void>; using R02 = typename A02::Reducer; - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); - static_assert(!A02::has_join_member_function, ""); - static_assert(!A02::has_init_member_function, ""); - static_assert(!A02::has_final_member_function, ""); - static_assert(A02::StaticValueSize == sizeof(double), ""); + static_assert(!A02::has_join_member_function); + static_assert(!A02::has_init_member_function); + static_assert(!A02::has_final_member_function); + static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -99,23 +97,19 @@ void test_functor_analysis() { using R03 = typename A03::Reducer; static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type*>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type&>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); //------------------------------ diff --git a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff52051..467b9ad157f 100644 --- a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { diff --git a/core/unit_test/TestJoinBackwardCompatibility.hpp b/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa709..efe4a2307a8 100644 --- a/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + error_operator_plus_equal_volatile); +static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index d7390172b68..4d203ead75f 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -1315,19 +1315,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); static_assert(std::is_same(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; @@ -1451,17 +1449,14 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_assert(std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); #endif } }; @@ -1530,19 +1525,16 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_assert( std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same::value, ""); + std::is_same::value); #endif } }; @@ -1622,11 +1614,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index aa11f21dd1f..3c159ebb341 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -407,7 +407,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT::value); \ template \ using has_##TRAIT = Kokkos::is_detected; \ - static_assert(!has_##TRAIT::value, ""); + static_assert(!has_##TRAIT::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -489,39 +489,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min(), ""); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min()); // integer types -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -588,15 +588,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ - static_assert( \ - std::numeric_limits::TRAIT() != std::numeric_limits::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - std::numeric_limits::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + Kokkos::Experimental::TRAIT::value); \ + static_assert(std::numeric_limits::TRAIT() != \ + std::numeric_limits::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + std::numeric_limits::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -616,14 +614,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -671,17 +666,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ diff --git a/core/unit_test/TestTeamBasic.hpp b/core/unit_test/TestTeamBasic.hpp index c395bc0837c..a3d84c5e16b 100644 --- a/core/unit_test/TestTeamBasic.hpp +++ b/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic::value, ""); + static_assert(!std::is_arithmetic::value); TestTeamBroadcast, long_wrapper>::test_teambroadcast(0, 1); diff --git a/core/unit_test/TestUtilities.hpp b/core/unit_test/TestUtilities.hpp index b1f9d30c1fc..ad5a0df92de 100644 --- a/core/unit_test/TestUtilities.hpp +++ b/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of, Kokkos::pair>{}, ""); - static_assert(is_specialization_of, Kokkos::View>{}, ""); + static_assert(is_specialization_of, Kokkos::pair>{}); + static_assert(!is_specialization_of, Kokkos::pair>{}); + static_assert(is_specialization_of, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of const, Kokkos::View>{}, - ""); + static_assert( + !is_specialization_of const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of, Kokkos::pair>{}); } namespace { diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index ffc500e4a9a..4c27695f6d1 100644 --- a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -958,8 +958,7 @@ class TestViewAPI { using mirror_type = typename view_type::HostMirror; static_assert(std::is_same::value, - ""); + typename mirror_type::memory_space>::value); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 9173f0d4316..a4dfdb26e3f 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - static_assert(int(dim_0::ArgN2) == 1, ""); - - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); - - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); - - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); - - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); - - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); - - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); - - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); + + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); + + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); + + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); + + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); + + static_assert(int(dim_s0_s3::rank) == int(2)); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); + + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,11 +514,11 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { @@ -529,54 +529,48 @@ void test_view_mapping() { using a_const_int_r1 = ViewArrayAnalysis; using a_const_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r1::dimension::rank == 1, ""); - static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0> >::value); + static_assert(std::is_same::value); - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); + static_assert(std::is_same::value); - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); static_assert(std::is_same >::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -587,15 +581,15 @@ void test_view_mapping() { // Dimensions of t_i4 are appended to the multdimensional array. using a_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -603,71 +597,54 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, ""); - static_assert( - std::is_same::value, - ""); + std::is_same::value); + static_assert(std::is_same::value); using a_const_int_r3 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0, 0, 4> >::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); - static_assert( - std::is_same::value, - ""); + int* * [4]>::value); + static_assert(std::is_same::value); static_assert( std::is_same::value, - ""); + int* * [4]>::value); // std::cout << "typeid( const int**[4] ).name() = " << typeid( const // int**[4] ).name() << std::endl; diff --git a/core/unit_test/TestViewMapping_b.hpp b/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da845..4aee035d17a 100644 --- a/core/unit_test/TestViewMapping_b.hpp +++ b/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -167,7 +167,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(std::is_trivially_copyable{}, ""); + static_assert(std::is_trivially_copyable{}); } } // namespace Test diff --git a/core/unit_test/cuda/TestCuda_Spaces.cpp b/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101abb..11fe6b8555b 100644 --- a/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,166 @@ __global__ void test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert( - !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaSpace>::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View; static_assert(std::is_same::Space; static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); } } // namespace Test diff --git a/core/unit_test/hip/TestHIP_Spaces.cpp b/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e28837..8f7499c244b 100644 --- a/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,164 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); } template diff --git a/core/unit_test/sycl/TestSYCL_Spaces.cpp b/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f8432488..a4fd053e83d 100644 --- a/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,235 +21,192 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::SYCLSharedUSMSpace>::Space, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + Kokkos::Experimental::SYCLHostUSMSpace>::value); static_assert( std::is_same< Kokkos::Device, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLSharedUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { diff --git a/core/unit_test/tools/TestProfilingSection.cpp b/core/unit_test/tools/TestProfilingSection.cpp index 318766ac455..9d35d67feb0 100644 --- a/core/unit_test/tools/TestProfilingSection.cpp +++ b/core/unit_test/tools/TestProfilingSection.cpp @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible::value, ""); -static_assert(!std::is_copy_constructible::value, ""); -static_assert(!std::is_move_constructible::value, ""); -static_assert(!std::is_copy_assignable::value, ""); -static_assert(!std::is_move_assignable::value, ""); +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); From fcb0452d0d14ca865b057b7c14941106f3f374ff Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 Nov 2023 17:54:28 +0000 Subject: [PATCH 02/30] OpenMPTarget: Guard scratch memory usage in ParallelReduce --- core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp | 1 + core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp | 1 + .../Kokkos_OpenMPTarget_ParallelReduce_Range.hpp | 6 +++++- .../Kokkos_OpenMPTarget_ParallelReduce_Team.hpp | 7 ++++++- .../OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp | 7 ++++++- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 1902c38409a..ea434b39533 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -75,6 +75,7 @@ int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; +std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index a84de76aad0..2a7063b966a 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -750,6 +750,7 @@ class OpenMPTargetExec { int64_t thread_local_bytes, int64_t league_size); static void* m_scratch_ptr; + static std::mutex m_mutex_scratch_ptr; static int64_t m_scratch_size; static int* m_lock_array; static uint64_t m_lock_size; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4452af3846d..caa568a8925 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,6 +55,9 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: @@ -105,7 +108,8 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} + m_result_ptr_num_elems(arg_result_view.size()), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa71511..8abffa47a43 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,6 +470,10 @@ class ParallelReduce m_scratch_memory_lock; + public: void execute() const { const FunctorType& functor = m_functor_reducer.get_functor(); @@ -517,7 +521,8 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 41e62ce6e6b..6878531730d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -438,6 +438,10 @@ class ParallelReduce m_scratch_memory_lock; + public: inline void execute() const { execute_tile( @@ -452,7 +456,8 @@ class ParallelReduce::accessible) {} + typename ViewType::memory_space>::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} template inline std::enable_if_t execute_tile(const FunctorType& functor, From 26464df04cc9fd24091b0fe719bfd85e2900ee97 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 Nov 2023 13:39:27 -0500 Subject: [PATCH 03/30] SYCL: Implement DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path (#6534) * SYCL: Implement DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION path * Sync with desul * [ci skip] Try improving comments * Configure DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION instead of compiler flag * Print SYCL_EXT_ONEAPI_DEVICE_GLOBAL in configuration --- cmake/kokkos_arch.cmake | 13 ++- core/src/CMakeLists.txt | 3 + core/src/SYCL/Kokkos_SYCL.cpp | 5 ++ .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelFor_Range.hpp | 2 + .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 2 + .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 2 + .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 2 + tpls/desul/Config.hpp.cmake.in | 1 + .../include/desul/atomics/Lock_Array_SYCL.hpp | 80 +++++++++++++++++-- tpls/desul/src/Lock_Array_SYCL.cpp | 21 ++--- 13 files changed, 113 insertions(+), 24 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 5d857f42fdc..920ce8eadfc 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -585,16 +585,20 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Even if SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we still can't -# use device global variables with shared libraries -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. +IF(KOKKOS_ENABLE_SYCL) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") INCLUDE(CheckCXXSymbolExists) CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSE() + ELSEIF(NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" #include @@ -614,6 +618,7 @@ IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + # Only the separable compilation implementation is supported. COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED ) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 012af0a7d06..a4edf1ba160 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -18,6 +18,9 @@ IF (NOT desul_FOUND) ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + ENDIF() ENDIF() IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693a..af64b6908d4 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -99,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b0..7fbf5420f83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor, const BarePolicy bare_policy(m_policy); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 9c5767d209f..b4de7eb89ff 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 1f2629407b0..f8abdf8443d 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index bc2e47658ed..953d2235b31 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -103,6 +103,8 @@ class Kokkos::Impl::ParallelReduce global_mem; sycl::device_ptr group_results; + desul::ensure_sycl_lock_arrays_on_device(q); + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor num_teams_done(1, cgh); diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index a7bc738191e..614c2352b9e 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,7 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp index 8216f9a797c..e1170ed2aae 100644 --- a/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp @@ -57,14 +57,35 @@ void finalize_lock_arrays_sycl(sycl::queue q); * \brief This global variable in SYCL space is what kernels use to get access * to the lock arrays. * - * There is only one single instance of this global variable for the entire - * executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose - * declaration here must be extern). This one instance will be initialized - * by initialize_host_sycl_lock_arrays and need not be modified afterwards. + * When relocatable device code is enabled, there is only one single instance of this + * global variable for the entire executable, whose definition will be in + * Kokkos_SYCL_Locks.cpp (and whose declaration here must then be extern). This one + * instance will be initialized by initialize_host_sycl_lock_arrays and need not be + * modified afterwards. + * + * When relocatable device code is disabled, an instance of this variable will be + * created in every translation unit that sees this header file (we make this clear by + * marking it static, meaning no other translation unit can link to it). Since the + * Kokkos_SYCL_Locks.cpp translation unit cannot initialize the instances in other + * translation units, we must update this SYCL global variable based on the Host global + * variable prior to running any kernels that will use it. That is the purpose of the + * ensure_sycl_lock_arrays_on_device function. */ -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} + #else // not supported template @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/tpls/desul/src/Lock_Array_SYCL.cpp b/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41a..6660c76e11a 100644 --- a/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - // sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + copy_sycl_lock_arrays_to_device(q); q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif From c8b4fe848daf494ad9802040dc6850f9a939f19e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Nov 2023 09:11:54 -1000 Subject: [PATCH 04/30] Desul atomics: Trade SYCL-specific compile definition for a macro defintion in the configuration header --- cmake/kokkos_arch.cmake | 5 ++--- core/src/Kokkos_Atomics_Desul_Config.hpp | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 920ce8eadfc..986e1b5bfb8 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -595,9 +595,8 @@ IF(KOKKOS_ENABLE_SYCL) INCLUDE(CheckCXXSymbolExists) CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) ELSEIF(NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" @@ -620,7 +619,7 @@ IF(KOKKOS_ENABLE_SYCL) IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + DEFAULT -fsycl-device-code-split=off ) ENDIF() ENDIF() diff --git a/core/src/Kokkos_Atomics_Desul_Config.hpp b/core/src/Kokkos_Atomics_Desul_Config.hpp index 4cf170f5f13..0523b21c513 100644 --- a/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/core/src/Kokkos_Atomics_Desul_Config.hpp @@ -32,4 +32,8 @@ static_assert(false, #define DESUL_CUDA_ARCH_IS_PRE_VOLTA #endif +#if defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) +#define DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED +#endif + #endif // KOKKOS_ATOMICS_DESUL_CONFIG_HPP From 0a83695e5c53ee255c880bc2e10c0a4986dcc315 Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Khan Date: Thu, 9 Nov 2023 14:06:51 -0500 Subject: [PATCH 05/30] Replace Marsaglia polar method with Box-muller to generate a normally distributed random number (#6556) * Kokkos Random: Replace Marsaglia polar method with Box-muller to generate a normally distributed random number * Apply clang-formatting * Add const qualifier to some internal variables * Update Kokkos_Random.hpp --- algorithms/src/Kokkos_Random.hpp | 34 +++++++++++++++----------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp index 2d7d236d2fc..89126609885 100644 --- a/algorithms/src/Kokkos_Random.hpp +++ b/algorithms/src/Kokkos_Random.hpp @@ -849,18 +849,17 @@ class Random_XorShift64 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1094,18 +1093,17 @@ class Random_XorShift1024 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION From 3f773d057b37ea59a1cfe3e0a098d5dbe157ee47 Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 9 Nov 2023 11:38:07 -0800 Subject: [PATCH 06/30] OpenMP: No memset in viewfill (#6573) * OpenMP: Edit copyview implementation. * OpenMP: Use memset for Serial backend. * Update core/src/Kokkos_CopyViews.hpp OpenMP: do not use memset for 0's only if execution space is OpenMP Co-authored-by: Daniel Arndt * Fix incorrect code. --------- Co-authored-by: Rahulkumar Gayatri Co-authored-by: Daniel Arndt --- core/src/Kokkos_CopyViews.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index a0ca55be704..3f02748c9cc 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -1348,13 +1348,14 @@ inline std::enable_if_t< contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) + // With OpenMP, using memset has significant performance issues. + if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v +#endif + ) ZeroMemset>(exec_space, dst, value); else -#endif contiguous_fill(exec_space, dst, value); } From 81e308e7da46aaa4ab040789a2715a7ebf99d200 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 Nov 2023 09:49:28 -1000 Subject: [PATCH 07/30] Revert "Desul atomics: Trade SYCL-specific compile definition for a macro defintion in the configuration header" --- cmake/kokkos_arch.cmake | 5 +++-- core/src/Kokkos_Atomics_Desul_Config.hpp | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 986e1b5bfb8..920ce8eadfc 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -595,8 +595,9 @@ IF(KOKKOS_ENABLE_SYCL) INCLUDE(CheckCXXSymbolExists) CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - # Use the non-separable compilation implementation to support shared libraries as well. SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. + COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) ELSEIF(NOT BUILD_SHARED_LIBS) INCLUDE(CheckCXXSourceCompiles) CHECK_CXX_SOURCE_COMPILES(" @@ -619,7 +620,7 @@ IF(KOKKOS_ENABLE_SYCL) IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off + DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED ) ENDIF() ENDIF() diff --git a/core/src/Kokkos_Atomics_Desul_Config.hpp b/core/src/Kokkos_Atomics_Desul_Config.hpp index 0523b21c513..4cf170f5f13 100644 --- a/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/core/src/Kokkos_Atomics_Desul_Config.hpp @@ -32,8 +32,4 @@ static_assert(false, #define DESUL_CUDA_ARCH_IS_PRE_VOLTA #endif -#if defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) -#define DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -#endif - #endif // KOKKOS_ATOMICS_DESUL_CONFIG_HPP From 97a90d5dd22afafebb5c0202ed8ab63724566304 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 9 Nov 2023 13:14:52 -1000 Subject: [PATCH 08/30] OpenACC: add atomics support (#6446) * Initial OpenACC atomic construct implementation. * Partially fixed bugs in the OpenACC atomic implementations. * First working version, where general atomic implementations work only on a sequential host * Update Unit Test CMake * Update as suggested by code review: - Remove const_cast() - Change Kokkos::abort() with printf() - Add FIXME_OPENACC comment. * Disable unsupporte OpenACC atomic tests. (OpenACC C/C++ does not support atomic max/min/mod operations) Disable TestOpenACC_BitManipulationBuiltins for OpenACC due to errors. * Apply ClangFormat * Disable unsupported unit tests when by old NVHPC compilers (V22.5 or older). * Apply ClangFormat * Update tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp Co-authored-by: Damien L-G * Restore unit tests that were disabled for old NVHPC compilers (V22.5 or older) * Update unit test CMakeLists.txt to include unit tests enabled by this PR. * Change the minimum version of the NVHPC compiler from 22.3 to 22.9 for the OpenACC backend. * Changed the way to guard unsupported atomic tests for the OpenACC backend. * Remove unnecessary guarding on unsupported atomic tests for the OpenACC backend. * Minor updates according to the code review. * Changed the supported-type-checking code from macro to alias template as suggested by the code review. * Undo changing the minimim required version of NVHPC. * Apply suggestions from code review Co-authored-by: Damien L-G * Change the KOKKOS_COMPILER_NVHPC macro to __NVCOMPILER * Rename a variable's name from `tmp` to `old in atomic_fetch_*() functions. * Change `ptr[0]` to `*ptr` as suggested by the code review. * Add FIXME comments in `device_atomic_thread_fence()`. * Moved definitions into the desul::Impl namespace as suggested by the code review. * Clean up the OpenACC atomic implementations. Re-enable atomic max/min tests for OpenACC. * Fix a typo (sid => std) * Minor bug fix on OpenACC * Update .jenkins to the latest. * Test this please * Try again * Fix typo * Deal with conflicts * Disable complex float atomic unit tests for OpenACC backend * Sync with PR opened on the desul side * DO NOT MERGE disable all CI but OpenACC * - Disable atomic-fetch-shift tests for NVHPC OpenACC compiler, which fail due to compiler bugs, which are reported to NVIDIA. - Change the values of start and end variables in TestAtomicOperations_double.hpp and TestAtomicOperations_float.hpp so that atomic-division tests calculate trivial divisions. (In the original tests, NVHPC compiler failed since device results are slightly different from the host results due to precision mismatch.) * Add atomic_op_test)rel() to TestAtomicOperations.hpp, which compares the host and device atomic operations using a relative error. * Revert "DO NOT MERGE disable all CI but OpenACC" This reverts commit 18132bffc0fd5491ed4c8191377cd8a9d1692910. * [desul_atomics] Fixup Kokkos -> DESUL in error message with OpenACC --------- Co-authored-by: Seyong Lee --- Makefile.kokkos | 6 + core/src/CMakeLists.txt | 3 + core/src/Kokkos_Macros.hpp | 2 +- core/unit_test/CMakeLists.txt | 19 +- core/unit_test/TestAtomicOperations.hpp | 74 +++ core/unit_test/TestAtomics.hpp | 4 +- tpls/desul/Config.hpp.cmake.in | 1 + .../desul/atomics/Compare_Exchange.hpp | 3 + .../atomics/Compare_Exchange_OpenACC.hpp | 149 ++++++ tpls/desul/include/desul/atomics/Fetch_Op.hpp | 3 + .../desul/atomics/Fetch_Op_OpenACC.hpp | 427 ++++++++++++++++++ tpls/desul/include/desul/atomics/Generic.hpp | 48 ++ .../desul/atomics/Lock_Based_Fetch_Op.hpp | 3 + .../atomics/Lock_Based_Fetch_Op_OpenACC.hpp | 77 ++++ tpls/desul/include/desul/atomics/Macros.hpp | 28 ++ .../include/desul/atomics/Thread_Fence.hpp | 3 + .../desul/atomics/Thread_Fence_OpenACC.hpp | 25 + 17 files changed, 855 insertions(+), 20 deletions(-) create mode 100644 tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp create mode 100644 tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp diff --git a/Makefile.kokkos b/Makefile.kokkos index 7c1914e3076..97b92a32892 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1440,6 +1440,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index a4edf1ba160..b4a25c0813e 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -25,6 +25,9 @@ IF (NOT desul_FOUND) IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP ENDIF() + IF(KOKKOS_ENABLE_OPENACC) + SET(DESUL_ATOMICS_ENABLE_OPENACC ON) + ENDIF() CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index d32ab2e57b6..3f53fcba683 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -87,7 +87,7 @@ #if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) + !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENACC) #define KOKKOS_INTERNAL_NOT_PARALLEL #endif diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 38dc1364768..8c9a6684987 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -65,7 +65,7 @@ SET(KOKKOS_THREADS_NAME Threads) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) + SET(KOKKOS_OPENACC_FEATURE_LEVEL 17) ENDIF() SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -524,17 +524,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -551,17 +541,10 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) endif() diff --git a/core/unit_test/TestAtomicOperations.hpp b/core/unit_test/TestAtomicOperations.hpp index a5aebed4138..cd7ba47aa1e 100644 --- a/core/unit_test/TestAtomicOperations.hpp +++ b/core/unit_test/TestAtomicOperations.hpp @@ -368,6 +368,63 @@ bool atomic_op_test(T old_val, T update) { return result == 0; } +template +constexpr T relative_error_threshold = T(1.0e-15); + +template +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > relative_error_threshold) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold) local_result += 2; + if (fabs(op_data(2)) > relative_error_threshold) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 4) != 0) + printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; +} + //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- @@ -395,6 +452,12 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: return atomic_op_test(old_val, update); case 10: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: return update_in >= 0 ? atomic_op_test( old_val, update) @@ -403,6 +466,7 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { return update_in >= 0 ? atomic_op_test( old_val, update) : true; +#endif case 13: return atomic_op_test(old_val, update); case 14: @@ -440,10 +504,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test(old_val, update); case 3: return atomic_op_test(old_val, update); case 4: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel( + old_val, update) + : true; +#else case 5: return update != 0 ? atomic_op_test(old_val, update) : true; +#endif case 6: return atomic_op_test(old_val, update); } diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 2b40f12d0a4..5f48e8c9746 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -498,7 +498,9 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE((TestAtomic::Loop(100, 2))); ASSERT_TRUE((TestAtomic::Loop(100, 3))); -#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET + // FIXME_OPENACC: atomic operations on composite types are not supported. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 2))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 3))); diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index 614c2352b9e..aed7ecfabc9 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -16,5 +16,6 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_SYCL #cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1dee..72639fc4932 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 00000000000..225079c15db --- /dev/null +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include + +#include +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separetely to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'". + // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast(dest), + reinterpret_cast(compare), + reinterpret_cast(value)); + return reinterpret_cast(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v) { +#else + } else if constexpr (std::is_same_v || std::is_same_v) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c57437..1b161397c74 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..10294c423f9 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,427 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include // min, max +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v || +#endif + is_openacc_integral_type_v; + +#else + +template +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v; + +#endif + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_add( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + int old; + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_add_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +// + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Generic.hpp b/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e34..fa71477c299 100644 --- a/tpls/desul/include/desul/atomics/Generic.hpp +++ b/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 +124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 +272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, @@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ -343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..6b78ce39043 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d323..d11beb0c805 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expresion whether we are on host or +// on device with NVHPC. We use the device implementation on both sides. +#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07f..6a741f6d478 100644 --- a/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 00000000000..a5c8aa1c8a7 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. +} + +} // namespace Impl +} // namespace desul + +#endif From d5a4802911318aebecbc775990dd198260ce2383 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 9 Nov 2023 21:52:37 -0500 Subject: [PATCH 09/30] Fix infinity, quiet_NaN, signaling_Nan, isfinite, isnan, isinf for half_t and bhalf_t (#6543) * Fix nvcc warning for non-trivial types in bit_cast * Introduce BitComparisonWrapper * Implement isnan, isfinite, isinf for half_t, bhalf_t with bit comparison * Fix infinity, quiet_NaN, signaling_NaN for half_t, bhalf_t * Improve tests * Disable TestCuda_WithoutInitializing for NVHPC * Define exponent/fraction_mask in FloatingPointWrapper.hpp * Minimize changes to TestMathematicalFunctions.hpp * Enable tests for inf, quiet_nan, signaling_nan for half_t and bhalf_t * Don't repeat storage class specifier in template specialization * Try inline constexpr and move definitons for the same type together * Disable numeric traits unit tests for NVHPC * Define comparison operators for BitComparisonWrapper * Fix TestNumericTraits, no constexpr consructor for [b]half_t --- containers/unit_tests/CMakeLists.txt | 5 + core/src/Kokkos_BitManipulation.hpp | 2 +- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 68 ++++++- .../Kokkos_Half_MathematicalFunctions.hpp | 80 +++++++- core/src/impl/Kokkos_Half_NumericTraits.hpp | 20 +- core/unit_test/TestMathematicalFunctions.hpp | 179 ++++++++++++++++-- core/unit_test/TestNumericTraits.hpp | 24 ++- 7 files changed, 342 insertions(+), 36 deletions(-) diff --git a/containers/unit_tests/CMakeLists.txt b/containers/unit_tests/CMakeLists.txt index b777581043d..e69e46bb6a8 100644 --- a/containers/unit_tests/CMakeLists.txt +++ b/containers/unit_tests/CMakeLists.txt @@ -46,6 +46,11 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() + # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. extractor: bad opc 0 + if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + endif() + KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/core/src/Kokkos_BitManipulation.hpp b/core/src/Kokkos_BitManipulation.hpp index f3baf71660e..f5653aaba34 100644 --- a/core/src/Kokkos_BitManipulation.hpp +++ b/core/src/Kokkos_BitManipulation.hpp @@ -115,7 +115,7 @@ bit_cast(From const& from) noexcept { return sycl::bit_cast(from); #else To to; - memcpy(&to, &from, sizeof(To)); + memcpy(static_cast(&to), static_cast(&from), sizeof(To)); return to; #endif } diff --git a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index b1ff643a71e..4a22898d168 100644 --- a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -18,6 +18,7 @@ #define KOKKOS_HALF_FLOATING_POINT_WRAPPER_HPP_ #include +#include // bit_cast #include #include // istream & ostream for extraction and insertion ops @@ -215,10 +216,70 @@ cast_from_wrapper(const Kokkos::Experimental::bhalf_t& x); /************************** END forward declarations **************************/ namespace Impl { + +template +struct BitComparisonWrapper { + std::uint16_t value; + + template + KOKKOS_FUNCTION friend bool operator==(BitComparisonWrapper a, Number b) { + return static_cast(a) == b; + } + + template + KOKKOS_FUNCTION friend bool operator!=(BitComparisonWrapper a, Number b) { + return static_cast(a) != b; + } + + template + KOKKOS_FUNCTION friend bool operator<(BitComparisonWrapper a, Number b) { + return static_cast(a) < b; + } + + template + KOKKOS_FUNCTION friend bool operator<=(BitComparisonWrapper a, Number b) { + return static_cast(a) <= b; + } + + template + KOKKOS_FUNCTION friend bool operator>(BitComparisonWrapper a, Number b) { + return static_cast(a) > b; + } + + template + KOKKOS_FUNCTION friend bool operator>=(BitComparisonWrapper a, Number b) { + return static_cast(a) >= b; + } +}; + +template +inline constexpr BitComparisonWrapper exponent_mask; +template +inline constexpr BitComparisonWrapper fraction_mask; + +#ifdef KOKKOS_IMPL_HALF_TYPE_DEFINED +template <> +inline constexpr BitComparisonWrapper + exponent_mask{0b0'11111'0000000000}; +template <> +inline constexpr BitComparisonWrapper + fraction_mask{0b0'00000'1111111111}; +#endif + +#ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED +template <> +inline constexpr BitComparisonWrapper + exponent_mask{0b0'11111111'0000000}; +template <> +inline constexpr BitComparisonWrapper + fraction_mask{0b0'00000000'1111111}; +#endif + template class alignas(FloatType) floating_point_wrapper { public: - using impl_type = FloatType; + using impl_type = FloatType; + using bit_comparison_type = BitComparisonWrapper; private: impl_type val; @@ -269,6 +330,11 @@ class alignas(FloatType) floating_point_wrapper { #endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH } + KOKKOS_FUNCTION + floating_point_wrapper(bit_comparison_type rhs) { + val = Kokkos::bit_cast(rhs); + } + // Don't support implicit conversion back to impl_type. // impl_type is a storage only type on host. KOKKOS_FUNCTION diff --git a/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp b/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp index 55e0cf0c8ff..e6a5cadc67c 100644 --- a/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp +++ b/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp @@ -18,6 +18,7 @@ #define KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_ #include // For the float overloads +#include // bit_cast // clang-format off namespace Kokkos { @@ -74,7 +75,7 @@ namespace Kokkos { KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long) \ KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long long) \ KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long long) - + #define KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF(FUNC, HALF_TYPE) \ KOKKOS_INLINE_FUNCTION bool FUNC(HALF_TYPE x) { \ @@ -155,10 +156,77 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, nextaf KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, copysign) // Classification and comparison functions // fpclassify -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, isfinite) -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, isinf) -#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, isnan) + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isfinite(Kokkos::Experimental::half_t x) { + using bit_type = Kokkos::Experimental::half_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return (bit_pattern_x.value & exponent_mask.value) != exponent_mask.value; +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isfinite(Kokkos::Experimental::bhalf_t x) { + using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return (bit_pattern_x.value & exponent_mask.value) != exponent_mask.value; +} +#endif + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isinf(Kokkos::Experimental::half_t x) { + using bit_type = Kokkos::Experimental::half_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) == 0)); +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isinf(Kokkos::Experimental::bhalf_t x) { + using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) == 0)); +} +#endif + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isnan(Kokkos::Experimental::half_t x) { + using bit_type = Kokkos::Experimental::half_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) != 0)); +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isnan(Kokkos::Experimental::bhalf_t x) { + using bit_type = Kokkos::Experimental::bhalf_t::bit_comparison_type; + constexpr bit_type exponent_mask = Kokkos::Experimental::Impl::exponent_mask; + constexpr bit_type fraction_mask = Kokkos::Experimental::Impl::fraction_mask; + const bit_type bit_pattern_x = bit_cast( + static_cast(x)); + return ( + ((bit_pattern_x.value & exponent_mask.value) == exponent_mask.value) && + ((bit_pattern_x.value & fraction_mask.value) != 0)); +} #endif // isnormal KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, signbit) @@ -188,4 +256,4 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF, imag) #undef KOKKOS_IMPL_MATH_H_FUNC_WRAPPER } // namespace Kokkos // clang-format on -#endif // KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_ \ No newline at end of file +#endif // KOKKOS_HALF_MATHEMATICAL_FUNCTIONS_HPP_ diff --git a/core/src/impl/Kokkos_Half_NumericTraits.hpp b/core/src/impl/Kokkos_Half_NumericTraits.hpp index b5cbf22194c..9ccad45e977 100644 --- a/core/src/impl/Kokkos_Half_NumericTraits.hpp +++ b/core/src/impl/Kokkos_Half_NumericTraits.hpp @@ -70,7 +70,7 @@ /// template <> struct Kokkos::Experimental::Impl::infinity_helper { - static constexpr int value = 0x7C00; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'0000000000}; }; /// \brief: Minimum normalized number @@ -157,30 +157,30 @@ struct Kokkos::Experimental::Impl::norm_min_helper< /// \brief: Quiet not a half precision number /// -/// IEEE 754 defines this as all exponent bits high. +/// IEEE 754 defines this as all exponent bits and the first fraction bit high. /// /// Quiet NaN in binary16: /// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// [0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] /// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 template <> struct Kokkos::Experimental::Impl::quiet_NaN_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0xfc000; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'1000000000}; }; /// \brief: Signaling not a half precision number /// -/// IEEE 754 defines this as all exponent bits and the first fraction bit high. +/// IEEE 754 defines this as all exponent bits and the second fraction bit high. /// /// Quiet NaN in binary16: /// [s e e e e e f f f f f f f f f f] -/// [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] +/// [0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0] /// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 template <> struct Kokkos::Experimental::Impl::signaling_NaN_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0xfe000; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11111'0100000000}; }; /// \brief: Number of digits in the matissa that can be represented @@ -267,7 +267,7 @@ struct Kokkos::Experimental::Impl::max_exponent_helper< /// template <> struct Kokkos::Experimental::Impl::infinity_helper { - static constexpr int value = 0x7F80; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'0000000}; }; // Minimum normalized number @@ -303,13 +303,13 @@ struct Kokkos::Experimental::Impl::norm_min_helper< template <> struct Kokkos::Experimental::Impl::quiet_NaN_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0x7fc000; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'1000000}; }; // Signaling not a bhalf number template <> struct Kokkos::Experimental::Impl::signaling_NaN_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0x7fe000; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111111'0100000}; }; // Number of digits in the matissa that can be represented // without losing precision. diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 4d203ead75f..7d8450eb548 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -1546,9 +1546,163 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) { // TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279 -// TODO: TestIsFinite, see https://github.com/kokkos/kokkos/issues/6279 +template +struct TestIsFinite { + TestIsFinite() { run(); } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 1), *this, errors); + ASSERT_EQ(errors, 0); + } + KOKKOS_FUNCTION void operator()(int, int& e) const { + using KE::infinity; + using KE::quiet_NaN; + using KE::signaling_NaN; + using Kokkos::isfinite; + if (!isfinite(1) || !isfinite(INT_MAX)) { + ++e; + Kokkos::printf("failed isfinite(integral)\n"); + } + if (!isfinite(2.f) || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value)) { + ++e; + Kokkos::printf("failed isfinite(float)\n"); + } + if (!isfinite(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isfinite(KE::half_t)\n"); + } + if (!isfinite(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isfinite(KE::bhalf_t)\n"); + } + if (!isfinite(3.) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isfinite(double)\n"); + } +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + if (!isfinite(4.l) || isfinite(quiet_NaN::value) || + isfinite(signaling_NaN::value) || + isfinite(infinity::value)) { + ++e; + Kokkos::printf("failed isfinite(long double)\n"); + } +#endif + // special values + if (isfinite(INFINITY) || isfinite(NAN)) { + ++e; + Kokkos::printf("failed isfinite(floating_point) special values\n"); + } + + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + static_assert(std::is_same::value); +#endif + } +}; + +TEST(TEST_CATEGORY, mathematical_functions_isfinite) { + TestIsFinite(); +} + +template +struct TestIsInf { + TestIsInf() { run(); } + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 1), *this, errors); + ASSERT_EQ(errors, 0); + } + KOKKOS_FUNCTION void operator()(int, int& e) const { + using KE::infinity; + using KE::quiet_NaN; + using KE::signaling_NaN; + using Kokkos::isinf; + if (isinf(1) || isinf(INT_MAX)) { + ++e; + Kokkos::printf("failed isinf(integral)\n"); + } + if (isinf(2.f) || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || !isinf(infinity::value)) { + ++e; + Kokkos::printf("failed isinf(float)\n"); + } + if (isinf(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isinf(KE::half_t)\n"); + } + if (isinf(static_cast(2.f)) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isinf(KE::bhalf_t)\n"); + } + if (isinf(3.) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || !isinf(infinity::value) +#endif + ) { + ++e; + Kokkos::printf("failed isinf(double)\n"); + } +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + if (isinf(4.l) || isinf(quiet_NaN::value) || + isinf(signaling_NaN::value) || + !isinf(infinity::value)) { + ++e; + Kokkos::printf("failed isinf(long double)\n"); + } +#endif + // special values + if (!isinf(INFINITY) || isinf(NAN)) { + ++e; + Kokkos::printf("failed isinf(floating_point) special values\n"); + } -// TODO: TestIsInf, see https://github.com/kokkos/kokkos/issues/6279 + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + static_assert(std::is_same::value); +#endif + } +}; + +TEST(TEST_CATEGORY, mathematical_functions_isinf) { + TestIsInf(); +} template struct TestIsNaN { @@ -1559,6 +1713,7 @@ struct TestIsNaN { ASSERT_EQ(errors, 0); } KOKKOS_FUNCTION void operator()(int, int& e) const { + using KE::infinity; using KE::quiet_NaN; using KE::signaling_NaN; using Kokkos::isnan; @@ -1567,35 +1722,34 @@ struct TestIsNaN { Kokkos::printf("failed isnan(integral)\n"); } if (isnan(2.f) || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value)) { + !isnan(signaling_NaN::value) || isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(float)\n"); } -#if !defined(KOKKOS_ENABLE_SYCL) && \ - !defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP if (isnan(static_cast(2.f)) -#if !defined(KOKKOS_ENABLE_CUDA) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) + !isnan(signaling_NaN::value) || + isnan(infinity::value) #endif ) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) -#if !defined(KOKKOS_ENABLE_CUDA) +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) + !isnan(signaling_NaN::value) || + isnan(infinity::value) #endif ) { ++e; KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); } -#endif if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value) + !isnan(signaling_NaN::value) || isnan(infinity::value) #endif ) { ++e; @@ -1603,7 +1757,8 @@ struct TestIsNaN { } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (isnan(4.l) || !isnan(quiet_NaN::value) || - !isnan(signaling_NaN::value)) { + !isnan(signaling_NaN::value) || + isnan(infinity::value)) { ++e; Kokkos::printf("failed isnan(long double)\n"); } diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 3c159ebb341..421eac022cc 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -101,8 +101,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(Infinity, int, int& e) const { using Kokkos::Experimental::infinity; - auto const inf = infinity::value; - auto const zero = T(0); + constexpr auto inf = infinity::value; + auto const zero = T(0); e += (int)!(inf + inf == inf); e += (int)!(inf != zero); use_on_device(); @@ -147,8 +147,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(QuietNaN, int, int& e) const { #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::quiet_NaN; - constexpr auto nan = quiet_NaN::value; - constexpr auto zero = T(0); + constexpr auto nan = quiet_NaN::value; + auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); #else @@ -159,8 +159,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(SignalingNaN, int, int& e) const { #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::signaling_NaN; - constexpr auto nan = signaling_NaN::value; - constexpr auto zero = T(0); + constexpr auto nan = signaling_NaN::value; + auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); #else @@ -204,6 +204,10 @@ struct TestNumericTraits< #endif TEST(TEST_CATEGORY, numeric_traits_infinity) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -387,6 +391,14 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { #endif } TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 + TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); From 61b93ec7fb8c426c275525bc1754d2a4f2741a92 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Tue, 14 Nov 2023 08:28:51 +0000 Subject: [PATCH 10/30] kokkos(unique): fix allocation of temporary view to enfore using the provided space instance --- algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e0..28635824585 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy From 9c37437eaa361217f5af9201449900fa4ef37491 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 15 Nov 2023 10:57:50 -0700 Subject: [PATCH 11/30] Use binary wrapper for consistency in definition of half types numeric traits (#6590) * Use binary wrapper for consistency in definition of half types numeric traits finite_{min,max}, epsilon, norm_min, and round_error * Fix epsilon usage in TestHalfOperators.hpp * Enable more test in TestNumericTraits.hpp * Fix norm_min * Remove unimplemented features * Guard NumericTraits tests for NVHPC * Define epsilon explicitly for NVHPC and [b]half_t * Fix norm_min bhalf_t comment --------- Co-authored-by: Daniel Arndt --- core/src/impl/Kokkos_Half_NumericTraits.hpp | 20 ++++++++--------- core/unit_test/TestHalfOperators.hpp | 10 ++++----- core/unit_test/TestMathematicalFunctions.hpp | 18 +++++++++++++-- core/unit_test/TestNumericTraits.hpp | 23 ++++++++++++++++++++ 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/core/src/impl/Kokkos_Half_NumericTraits.hpp b/core/src/impl/Kokkos_Half_NumericTraits.hpp index 9ccad45e977..4779c2a6e10 100644 --- a/core/src/impl/Kokkos_Half_NumericTraits.hpp +++ b/core/src/impl/Kokkos_Half_NumericTraits.hpp @@ -87,7 +87,7 @@ struct Kokkos::Experimental::Impl::infinity_helper template <> struct Kokkos::Experimental::Impl::finite_min_helper< Kokkos::Experimental::half_t> { - static constexpr float value = -65504.0F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b1'11110'1111111111}; // -65504 }; /// \brief: Maximum normalized number @@ -104,7 +104,7 @@ struct Kokkos::Experimental::Impl::finite_min_helper< template <> struct Kokkos::Experimental::Impl::finite_max_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 65504.0F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'11110'1111111111}; // +65504 }; /// \brief: This is the difference between 1 and the smallest floating point @@ -123,7 +123,7 @@ struct Kokkos::Experimental::Impl::finite_max_helper< template <> struct Kokkos::Experimental::Impl::epsilon_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0.0009765625F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00101'0000000000}; // 0.0009765625 }; /// @brief: The largest possible rounding error in ULPs @@ -134,7 +134,7 @@ struct Kokkos::Experimental::Impl::epsilon_helper< template <> struct Kokkos::Experimental::Impl::round_error_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0.5F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'01110'0000000000}; // 0.5 }; /// \brief: Minimum normalized positive half precision number @@ -152,7 +152,7 @@ struct Kokkos::Experimental::Impl::round_error_helper< template <> struct Kokkos::Experimental::Impl::norm_min_helper< Kokkos::Experimental::half_t> { - static constexpr float value = 0.00006103515625F; + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00001'0000000000}; // 0.00006103515625 }; /// \brief: Quiet not a half precision number @@ -274,30 +274,30 @@ struct Kokkos::Experimental::Impl::infinity_helper struct Kokkos::Experimental::Impl::finite_min_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = -3.38953139e38; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b1'11111110'1111111}; // -3.38953139e38 }; // Maximum normalized number template <> struct Kokkos::Experimental::Impl::finite_max_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 3.38953139e38; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'11111110'1111111}; // +3.38953139e3 }; // 1/2^7 template <> struct Kokkos::Experimental::Impl::epsilon_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0.0078125F; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'01111000'0000000}; // 0.0078125 }; template <> struct Kokkos::Experimental::Impl::round_error_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 0.5F; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'01111110'0000000}; // 0.5 }; // Minimum normalized positive bhalf number template <> struct Kokkos::Experimental::Impl::norm_min_helper< Kokkos::Experimental::bhalf_t> { - static constexpr float value = 1.1754494351e-38; + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'00000001'0000000}; // 1.175494351e-38 }; // Quiet not a bhalf number template <> diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index bf7013cf738..752e3b50816 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -975,7 +975,7 @@ struct Functor_TestHalfOperators { template void __test_half_operators(half_type h_lhs, half_type h_rhs) { - double epsilon = Kokkos::Experimental::epsilon::value; + half_type epsilon = Kokkos::Experimental::epsilon::value; Functor_TestHalfOperators f_device(h_lhs, h_rhs); Functor_TestHalfOperators f_host(h_lhs, h_rhs); @@ -990,9 +990,9 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { // printf("op_test = %d\n", op_test); ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - epsilon); + static_cast(epsilon)); ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - epsilon); + static_cast(epsilon)); } // volatile-qualified parameter type 'volatile half_type' is deprecated @@ -1015,9 +1015,9 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H || op_test == CDIV_H_H) { ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - epsilon); + static_cast(epsilon)); ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - epsilon); + static_cast(epsilon)); } } #endif diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 7d8450eb548..be479e0219a 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -240,11 +240,25 @@ struct FloatingPointComparison { } #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT KOKKOS_FUNCTION - KE::half_t eps(KE::half_t) const { return KE::epsilon::value; } + KE::half_t eps(KE::half_t) const { +// FIXME_NVHPC compile-time error +#ifdef KOKKOS_COMPILER_NVHPC + return 0.0009765625F; +#else + return KE::epsilon::value; +#endif + } #endif #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT KOKKOS_FUNCTION - KE::bhalf_t eps(KE::bhalf_t) const { return KE::epsilon::value; } + KE::bhalf_t eps(KE::bhalf_t) const { +// FIXME_NVHPC compile-time error +#ifdef KOKKOS_COMPILER_NVHPC + return 0.0078125; +#else + return KE::epsilon::value; +#endif + } #endif KOKKOS_FUNCTION double eps(float) const { return FLT_EPSILON; } diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index 421eac022cc..f6fdc8376fb 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -218,6 +218,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { } TEST(TEST_CATEGORY, numeric_traits_epsilon) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -228,6 +232,11 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { } TEST(TEST_CATEGORY, numeric_traits_round_error) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -238,6 +247,10 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { } TEST(TEST_CATEGORY, numeric_traits_norm_min) { +#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type + TestNumericTraits(); + TestNumericTraits(); +#endif TestNumericTraits(); TestNumericTraits(); // FIXME_NVHPC long double not supported @@ -309,6 +322,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); #if !defined(KOKKOS_ENABLE_CUDA) || \ @@ -330,6 +345,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); #if !defined(KOKKOS_ENABLE_CUDA) || \ @@ -359,6 +376,8 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); #if !defined(KOKKOS_ENABLE_CUDA) || \ @@ -368,6 +387,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { } TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { + TestNumericTraits(); + TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); From c60716df432ee5873886ef81505147bbe0072663 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 16 Nov 2023 09:20:14 +0100 Subject: [PATCH 12/30] try fix --- core/unit_test/TestDeviceAndThreads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 1d3ff8eea7e..511b182cab0 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -30,7 +30,7 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 3, 5, 7]: + for x in [1, 2, 4, 6, 8]: if x >= max_threads: break yield x From ee655c08ad2188474f82930e20083ace0383e625 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 16 Nov 2023 19:42:34 +0000 Subject: [PATCH 13/30] Fix TestNumericTriats.hpp for SYCL with bfloat16 support --- core/unit_test/TestNumericTraits.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index f6fdc8376fb..f197a2d8891 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -110,8 +110,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const { using Kokkos::Experimental::epsilon; - auto const eps = epsilon::value; - auto const one = T(1); + T const eps = epsilon::value; + T const one = 1; // Avoid higher precision intermediate representation compare() = one + eps; e += (int)!(compare() != one); From 1a145311ff8f002fa526608410c8e3685e908353 Mon Sep 17 00:00:00 2001 From: Pierre Kestener Date: Thu, 16 Nov 2023 16:57:16 +0100 Subject: [PATCH 14/30] Fix generated Makefile when using gnu_generate_makefile.sh and make >= 4.3 --- algorithms/unit_tests/Makefile | 14 +++++++------- containers/unit_tests/Makefile | 4 ++-- core/unit_test/Makefile | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/algorithms/unit_tests/Makefile b/algorithms/unit_tests/Makefile index 601217799a8..d3946c149ba 100644 --- a/algorithms/unit_tests/Makefile +++ b/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " > Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ ) \ ) diff --git a/containers/unit_tests/Makefile b/containers/unit_tests/Makefile index 2e35832cc89..18410882bca 100644 --- a/containers/unit_tests/Makefile +++ b/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 33a84b61f92..202809d3fc9 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -67,8 +67,8 @@ TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longi tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ ) \ ) @@ -82,8 +82,8 @@ KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > TestCuda_$(test).cpp); \ - $(shell echo "\#include " >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -277,8 +277,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) From 8fd8c94aa553ec8e355171c00d0ece65be530ac8 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Fri, 17 Nov 2023 22:10:41 +0100 Subject: [PATCH 15/30] Threads: add missing broadcast to TeamThreadRange parallel_scan (#6601) * try * use reference --- core/src/Threads/Kokkos_Threads_Team.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index b811a7944ba..8f9614233b2 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -1001,8 +1001,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto & team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1012,6 +1014,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } From 932c1fb2f059ba1cbd84e0c191459cc17ee8dc23 Mon Sep 17 00:00:00 2001 From: Dong Hun Lee Date: Mon, 13 Nov 2023 18:15:02 -0700 Subject: [PATCH 16/30] Added missing operator* to NEON simd --- simd/src/Kokkos_SIMD_NEON.hpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 43ece203890..61d506eac86 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -868,7 +868,11 @@ class simd> { return simd( vadd_s32(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast(lhs), static_cast(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1068,7 +1072,10 @@ class simd> { return simd( vaddq_s64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1261,7 +1268,10 @@ class simd> { return simd( vaddq_u64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( From ff7104cee13d01174ff896682e1aab7333934aff Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 17 Nov 2023 14:19:24 -0700 Subject: [PATCH 17/30] [ci skip] Update changelog on develop for 4.2.00 (#6592) * [ci skip] Update changelog on develop for 4.2.00 * [ci skip] Fix whitespace --- CHANGELOG.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0534d741f3..92bb6fdbe5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,97 @@ # CHANGELOG -## [4.1.00](https://github.com/kokkos/kokkos/tree/4.0.01) (2023-06-16) +## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00) + +### Features: +- SIMD: significant improvements to SIMD support and alignment with C++26 SIMD + - add `Kokkos::abs` overload for SIMD types [\#6069](https://github.com/kokkos/kokkos/pull/6069) + - add generator constructors [\#6347](https://github.com/kokkos/kokkos/pull/6347) + - convert binary operators to hidden friends [\#6320](https://github.com/kokkos/kokkos/pull/6320) + - add shift operators [\#6109](https://github.com/kokkos/kokkos/pull/6109) + - add `float` support [\#6177](https://github.com/kokkos/kokkos/pull/6177) + - add remaining `gather_from` and `scatter_to` overloads [\#6220](https://github.com/kokkos/kokkos/pull/6220) + - define simd math function overloads in the Kokkos namespace [\#6465](https://github.com/kokkos/kokkos/pull/6465), [\#6487](https://github.com/kokkos/kokkos/pull/6487) + - `Kokkos_ENABLE_NATIVE=ON` autodetects SIMD types supported [\#6188](https://github.com/kokkos/kokkos/pull/6188) + - fix AVX2 SIMD support for ZEN2 AMD CPU [\#6238](https://github.com/kokkos/kokkos/pull/6238) +- `Kokkos::printf` [\#6083](https://github.com/kokkos/kokkos/pull/6083) +- `Kokkos::sort`: support custom comparator [\#6253](https://github.com/kokkos/kokkos/pull/6253) +- `half_t` and `bhalf_t` numeric traits [\#5778](https://github.com/kokkos/kokkos/pull/5778) +- `half_t` and `bhalf_t` mixed comparisons [\#6407](https://github.com/kokkos/kokkos/pull/6407) +- `half_t` and `bhalf_t` mathematical functions [\#6124](https://github.com/kokkos/kokkos/pull/6124) +- `TeamThreadRange` `parallel_scan` with return value [\#6090](https://github.com/kokkos/kokkos/pull/6090), [\#6301](https://github.com/kokkos/kokkos/pull/6301), [\#6302](https://github.com/kokkos/kokkos/pull/6302), [\#6303](https://github.com/kokkos/kokkos/pull/6303), [\#6307](https://github.com/kokkos/kokkos/pull/6307) +- `ThreadVectorRange` `parallel_scan` with return value [\#6235](https://github.com/kokkos/kokkos/pull/6235), [\#6242](https://github.com/kokkos/kokkos/pull/6242), [\#6308](https://github.com/kokkos/kokkos/pull/6308), [\#6305](https://github.com/kokkos/kokkos/pull/6305), [\#6292](https://github.com/kokkos/kokkos/pull/6292) +- Add team-level std algorithms [\#6200](https://github.com/kokkos/kokkos/pull/6200), [\#6205](https://github.com/kokkos/kokkos/pull/6205), [\#6207](https://github.com/kokkos/kokkos/pull/6207), [\#6208](https://github.com/kokkos/kokkos/pull/6208), [\#6209](https://github.com/kokkos/kokkos/pull/6209), [\#6210](https://github.com/kokkos/kokkos/pull/6210), [\#6211](https://github.com/kokkos/kokkos/pull/6211), [\#6212](https://github.com/kokkos/kokkos/pull/6212), [\#6213](https://github.com/kokkos/kokkos/pull/6213), [\#6256](https://github.com/kokkos/kokkos/pull/6256), [\#6258](https://github.com/kokkos/kokkos/pull/6258), [\#6350](https://github.com/kokkos/kokkos/pull/6350), [\#6351](https://github.com/kokkos/kokkos/pull/6351) +- Serial: Allow for distinct execution space instances [\#6441](https://github.com/kokkos/kokkos/pull/6441) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Fixed potential data race in Cuda `parallel_reduce` [\#6236](https://github.com/kokkos/kokkos/pull/6236) +- Use `cudaMallocAsync` by default [\#6402](https://github.com/kokkos/kokkos/pull/6402) +- Bugfix for using Kokkos from a thread of execution [\#6299](https://github.com/kokkos/kokkos/pull/6299) + +#### HIP: +- New naming convention for AMD GPU: VEGA906, VEGA908, VEGA90A, NAVI1030 to AMD_GFX906, AMD_GFX908, AMD_GFX90A, AMD_GFX1030 [\#6266](https://github.com/kokkos/kokkos/pull/6266) +- Add initial support for gfx942: [\#6358](https://github.com/kokkos/kokkos/pull/6358) +- Improve reduction performance [\#6229](https://github.com/kokkos/kokkos/pull/6229) +- Deprecate `HIP(hipStream_t,bool)` constructor [\#6401](https://github.com/kokkos/kokkos/pull/6401) +- Add support for Graph [\#6370](https://github.com/kokkos/kokkos/pull/6370) +- Improve reduction performance when using Teams [\#6284](https://github.com/kokkos/kokkos/pull/6284) +- Fix concurrency calculation [\#6479](https://github.com/kokkos/kokkos/pull/6479) +- Fix potential data race in HIP `parallel_reduce` [\#6429](https://github.com/kokkos/kokkos/pull/6429) + +#### SYCL: +- Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246) +- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) +- Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321) +- Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322) +- Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189) + +#### OpenACC: +- Support Clacc compiler [\#6250](https://github.com/kokkos/kokkos/pull/6250) + +### General Enhancements +- Add missing `is_*_view` traits and `is_*_view_v` helper variable templates for `DynRankView`, `DynamicView`, `OffsetView`, `ScatterView` containers [\#6195](https://github.com/kokkos/kokkos/pull/6195) +- Make `nvcc_wrapper` and `compiler_launcher` scripts more portable by switching to a `#!/usr/bin/env` shebang [\#6357](https://github.com/kokkos/kokkos/pull/6357) +- Add an improved `Kokkos::malloc` / `Kokkos::free` performance test [\#6377](https://github.com/kokkos/kokkos/pull/6377) +- Ensure `Views` with `size==0` can be used with `deep_copy` [\#6273](https://github.com/kokkos/kokkos/pull/6273) +- `Kokkos::abort` is moved to header `Kokkos_Abort.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445) +- `KOKKOS_ASSERT`, `KOKKOS_EXPECTS`, `KOKKOS_ENSURES` are moved to header `Kokkos_Assert.hpp` [\#6445](https://github.com/kokkos/kokkos/pull/6445) +- Add a permuted-index mode to the gups benchmark [\#6378](https://github.com/kokkos/kokkos/pull/6378) +- Check for overflow during backend initialization [\#6159](https://github.com/kokkos/kokkos/pull/6159) +- Make constraints on `Kokkos::sort` more visible [\#6234](https://github.com/kokkos/kokkos/pull/6234) and cleanup API [\#6239](https://github.com/kokkos/kokkos/pull/6239) +- Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474) + + +### Build System Changes + +- Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282) +- Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342) + +### Incompatibilities (i.e. breaking changes) + - Ensure that `Kokkos::complex` only gets instantiated for cv-unqualified floating-point types [\#6251](https://github.com/kokkos/kokkos/pull/6251) + - Removed (deprecated-3) support for volatile join operators in reductions [\#6385](https://github.com/kokkos/kokkos/pull/6385) + - Enforce `ViewCtorArgs` restrictions for `create_mirror_view` [\#6304](https://github.com/kokkos/kokkos/pull/6304) + - SIMD types for ARM NEON are not autodetected anymore but need `Kokkos_ARCH_ARM_NEON` or `Kokkos_ARCH_NATIVE=ON` [\#6394](https://github.com/kokkos/kokkos/pull/6394) + - Remove `#include ` from headers where possible [\#6482](https://github.com/kokkos/kokkos/pull/6482) + +### Deprecations +- Deprecated `Kokkos::vector` [\#6252](https://github.com/kokkos/kokkos/pull/6252) +- All host allocation mechanisms except for `STD_MALLOC` have been deprecated [\#6341](https://github.com/kokkos/kokkos/pull/6341) + +### Bug Fixes + - Missing memory fence in `RandomPool::free_state` functions [\#6290](https://github.com/kokkos/kokkos/pull/6290) + - Fix for corner case in `Kokkos::Experimental::is_partitioned` algorithm [\#6257](https://github.com/kokkos/kokkos/pull/6257) + - Fix initialization of scratch lock variables in the `Cuda` backend [\#6433](https://github.com/kokkos/kokkos/pull/6433) + - Fixes for `Kokkos::Array` [\#6372](https://github.com/kokkos/kokkos/pull/6372) + - Fixed symlink configure issue for Windows [\#6241](https://github.com/kokkos/kokkos/pull/6241) + - OpenMPTarget init-join fix [\#6444](https://github.com/kokkos/kokkos/pull/6444) + - Fix atomic operations bug for Min and Max [\#6435](https://github.com/kokkos/kokkos/pull/6435) + - Fix implementation for `cyl_bessel_i0` [\#6484](https://github.com/kokkos/kokkos/pull/6484) + - Fix various NVCC warnings in `BinSort`, `Array`, and bit manipulation function templates [\#6483](https://github.com/kokkos/kokkos/pull/6483) + +## [4.1.00](https://github.com/kokkos/kokkos/tree/4.1.00) (2023-06-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.01...4.1.00) ### Features: From 81a9586539f4d71db4d6219aca8c2ca613be7ddf Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 17 Nov 2023 14:27:19 -0700 Subject: [PATCH 18/30] Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF (#6593) * Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF * Clean up tutorials --- core/src/Kokkos_Macros.hpp | 6 --- core/src/setup/Kokkos_Setup_SYCL.hpp | 8 ---- .../unit_test/TestBitManipulationBuiltins.hpp | 14 +++--- core/unit_test/TestMathematicalFunctions.hpp | 46 +++++++++---------- .../tutorial/01_hello_world/hello_world.cpp | 7 +-- .../hello_world_lambda.cpp | 10 ++-- .../01_thread_teams/thread_teams.cpp | 10 ++-- .../thread_teams_lambda.cpp | 16 +++---- .../nested_parallel_for.cpp | 15 ++---- 9 files changed, 47 insertions(+), 85 deletions(-) diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 3f53fcba683..a77e50b65b3 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -339,12 +339,6 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__) -#endif - //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61f..72017e38d88 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,4 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) \ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) -#endif - #endif diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff618..2f3bcfe817d 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -804,26 +804,26 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } } @@ -848,11 +848,11 @@ struct TestBitCastFunction { } if (!(bit_cast(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index be479e0219a..3150a015b5d 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -1304,12 +1304,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) { ++e; @@ -1360,26 +1360,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1387,8 +1387,7 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. || !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same(4.f))), @@ -1420,7 +1419,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1429,7 +1428,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast(-6.2f), static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1438,17 +1437,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1457,8 +1456,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } static_assert(std::is_same(4.f), @@ -1494,7 +1492,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1503,7 +1501,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1512,18 +1510,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1532,7 +1530,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } @@ -1748,7 +1746,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1758,7 +1756,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 diff --git a/example/tutorial/01_hello_world/hello_world.cpp b/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af833..22b8b6d63c8 100644 --- a/example/tutorial/01_hello_world/hello_world.cpp +++ b/example/tutorial/01_hello_world/hello_world.cpp @@ -58,12 +58,7 @@ struct hello_world { // is unnecessary but harmless. KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; diff --git a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f3076361..909765e1fc3 100644 --- a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -76,13 +76,9 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokko::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); #endif // You must call finalize() after you are done using Kokkos. diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435b..ee3f4721d91 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c7..1e6812adead 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -57,16 +57,12 @@ int main(int narg, char* args[]) { policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team. + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); #endif diff --git a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d1331..75d6089e9af 100644 --- a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; From f0af4672cabb5f7a13293bb10b8b68c5191ff735 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 08:19:29 +0100 Subject: [PATCH 19/30] try fix --- core/unit_test/TestDeviceAndThreads.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 511b182cab0..f61c850ffd9 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,6 +17,7 @@ import unittest import subprocess +import psutil PREFIX = "$" EXECUTABLE = "$" @@ -30,7 +31,9 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 4, 6, 8]: + phys_cores_count = psutil.cpu_count(logical=False) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] + for x in looplist: if x >= max_threads: break yield x From 2779b29b556948af548eb1eae6952969e0c81e17 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 09:08:02 +0100 Subject: [PATCH 20/30] avoid pyt package --- core/unit_test/TestDeviceAndThreads.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index f61c850ffd9..e246ebba9ff 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,7 +17,7 @@ import unittest import subprocess -import psutil +import os #psutil PREFIX = "$" EXECUTABLE = "$" @@ -31,7 +31,16 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - phys_cores_count = psutil.cpu_count(logical=False) + #phys_cores_count = psutil.cpu_count(logical=False) + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + if os.name == 'nt': + args = ['wmic', 'cpu', 'get', 'NumberOfCores'] + + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = result.stdout.decode('utf-8') + phys_cores_count = int(output) + print(phys_cores_count) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] for x in looplist: if x >= max_threads: From 17af2f3c40c213958fce992ca7afb08b9ab4d963 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 09:49:07 +0100 Subject: [PATCH 21/30] try --- core/unit_test/TestDeviceAndThreads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index e246ebba9ff..538cd9c3707 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -32,7 +32,7 @@ def GetFlag(flag, *extra_args): def GetNumThreads(max_threads): #phys_cores_count = psutil.cpu_count(logical=False) - args = ['sysctl', '-n', 'hw.physicalcpu_max'] + args = ['nproc', '--all'] #'sysctl', '-n', 'hw.physicalcpu_max'] if os.name == 'nt': args = ['wmic', 'cpu', 'get', 'NumberOfCores'] From 68e4bedc43cdb7ce1419b3da21fb84e797a7a295 Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 12:28:03 +0100 Subject: [PATCH 22/30] fix for macos --- core/unit_test/TestDeviceAndThreads.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 538cd9c3707..7624b8394b5 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,7 +17,7 @@ import unittest import subprocess -import os #psutil +import platform #psutil PREFIX = "$" EXECUTABLE = "$" @@ -32,15 +32,18 @@ def GetFlag(flag, *extra_args): def GetNumThreads(max_threads): #phys_cores_count = psutil.cpu_count(logical=False) - args = ['nproc', '--all'] #'sysctl', '-n', 'hw.physicalcpu_max'] - if os.name == 'nt': + args = [] + name = platform.system() + if name == 'Darwin': + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + elif name == 'Linux': + args = ['nproc', '--all'] + else: args = ['wmic', 'cpu', 'get', 'NumberOfCores'] result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = result.stdout.decode('utf-8') phys_cores_count = int(output) - print(phys_cores_count) - looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] for x in looplist: if x >= max_threads: From 61842b7d104402ce09a2ebd59e4657237425032b Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sat, 18 Nov 2023 12:32:25 +0100 Subject: [PATCH 23/30] remove comments --- core/unit_test/TestDeviceAndThreads.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index 7624b8394b5..d44af1ea3d1 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -17,7 +17,7 @@ import unittest import subprocess -import platform #psutil +import platform PREFIX = "$" EXECUTABLE = "$" @@ -31,7 +31,6 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - #phys_cores_count = psutil.cpu_count(logical=False) args = [] name = platform.system() if name == 'Darwin': From 33a1106da121e31a8ee84bcfaf3ccaa69556370f Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Thu, 16 Nov 2023 07:52:22 +0100 Subject: [PATCH 24/30] use reference --- core/src/impl/Kokkos_HostThreadTeam.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60f..25bf5921fcf 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto & team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); From 374064ab75aa4fd1727781b01c8c1724470788ac Mon Sep 17 00:00:00 2001 From: Francesco Rizzi Date: Sun, 19 Nov 2023 07:55:54 +0100 Subject: [PATCH 25/30] add branching --- core/unit_test/TestDeviceAndThreads.py | 4 +++- core/unit_test/UnitTest_DeviceAndThreads.cpp | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py index d44af1ea3d1..95727dad85c 100644 --- a/core/unit_test/TestDeviceAndThreads.py +++ b/core/unit_test/TestDeviceAndThreads.py @@ -43,7 +43,9 @@ def GetNumThreads(max_threads): result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = result.stdout.decode('utf-8') phys_cores_count = int(output) - looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \ + if GetFlag("hwloc_enabled") else [1,2,3,4,5] + for x in looplist: if x >= max_threads: break diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69b..ea944bae4cd 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -68,6 +68,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -93,6 +101,7 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(device_count); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG From 3dd0b825380048e378589546949d101263bf1f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Nov 2023 12:18:00 +0100 Subject: [PATCH 26/30] [ci skip] fix formatting --- core/src/Threads/Kokkos_Threads_Team.hpp | 2 +- core/src/impl/Kokkos_HostThreadTeam.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/Threads/Kokkos_Threads_Team.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp index 8f9614233b2..fd0f221365b 100644 --- a/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -1001,7 +1001,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } - auto & team_member = loop_bounds.thread; + auto& team_member = loop_bounds.thread; // 'scan_val' output is the exclusive prefix sum scan_val = team_member.team_scan(scan_val); diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 25bf5921fcf..25f09b82865 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto & team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); From ae75d38951e2cebb3537649bd3e7ca046d285148 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 20 Nov 2023 15:13:38 -0500 Subject: [PATCH 27/30] GitHub Workflows: Use Ubuntu 22.04 instead of Fedora for Intel compiler testing --- .github/workflows/continuous-integration-workflow.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 8c226c3766c..6446cbacd9b 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -25,22 +25,22 @@ jobs: backend: ['OPENMP'] clang-tidy: [''] include: - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' From 0262f7405e06b409df1c3a6ba0c8901ae7387110 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Tue, 21 Nov 2023 08:56:15 +0000 Subject: [PATCH 28/30] nvcc(wrapper): adding missing `--generate-line-info` arg --- bin/nvcc_wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index c1400872402..1b0fda12207 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|-v|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args From a4720ce414d8b92e7ed59ec0b76dc7f1a32df176 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 22 Nov 2023 09:16:26 -0500 Subject: [PATCH 29/30] Add clang-format check to GitHub workflows (#6612) * Add clang-format check to GitHub workflows * Use DoozyX/clang-format-lint-action * Fix capitalization; remove 'exclude' --- .github/workflows/clang-format-check.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/clang-format-check.yml diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml new file mode 100644 index 00000000000..1f557dbfcdf --- /dev/null +++ b/.github/workflows/clang-format-check.yml @@ -0,0 +1,11 @@ +name: clang-format check +on: [push, pull_request] +jobs: + formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run clang-format style check. + uses: DoozyX/clang-format-lint-action@v0.16.2 + with: + clangFormatVersion: 8 From f31436a0937ba7e21acce3049d1fac2550079140 Mon Sep 17 00:00:00 2001 From: "romin.tomasetti" Date: Mon, 27 Nov 2023 13:46:30 +0000 Subject: [PATCH 30/30] graph(HIP): adding inline keyword to fix #6623 --- core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index 3bde15444c7..7cc06d02fbe 100644 --- a/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -83,7 +83,7 @@ class GraphImpl { hipGraphExec_t m_graph_exec = nullptr; }; -GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(m_graph); if (m_graph_exec) { @@ -92,12 +92,12 @@ GraphImpl::~GraphImpl() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph)); } -GraphImpl::GraphImpl(Kokkos::HIP instance) +inline GraphImpl::GraphImpl(Kokkos::HIP instance) : m_execution_space(std::move(instance)) { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0)); } -void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node @@ -110,7 +110,7 @@ void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in it's policy template -void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -129,8 +129,8 @@ void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, - PredecessorRef arg_pred_ref) { +inline void GraphImpl::add_predecessor( + NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); KOKKOS_EXPECTS(pred_ptr); @@ -145,7 +145,7 @@ void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -void GraphImpl::submit() { +inline void GraphImpl::submit() { if (!m_graph_exec) { instantiate_graph(); } @@ -153,12 +153,12 @@ void GraphImpl::submit() { hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); } -Kokkos::HIP const& GraphImpl::get_execution_space() const +inline Kokkos::HIP const& GraphImpl::get_execution_space() const noexcept { return m_execution_space; } -auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(m_graph); KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), @@ -172,7 +172,7 @@ auto GraphImpl::create_root_node_ptr() { } template -auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { +inline auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for // each predecessor ref, so all we need to do here is create the (trivial)