diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81afe249..78ab771b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -447,9 +447,7 @@ if(${RUN_HIT} EQUAL 0)
   execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 endif()
 if(HIP_CATCH_TEST EQUAL "1")
-  enable_testing()
-  set(HIP_PATH ${HIP_ROOT_DIR})
-  add_subdirectory(${HIP_COMMON_DIR}/tests/catch ${PROJECT_BINARY_DIR}/catch)
+  message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! ")
 else()
   if(${RUN_HIT} EQUAL 0)
     set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})
diff --git a/hip-config.cmake.in b/hip-config.cmake.in
index 7c4fe7f9..b96803ee 100755
--- a/hip-config.cmake.in
+++ b/hip-config.cmake.in
@@ -206,16 +206,6 @@ if(HIP_COMPILER STREQUAL "clang")
   if (NOT compilePropIsSet)
     hip_add_interface_compile_flags(hip::device -x hip)
-    if (NOT EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode)
-      # This path is to support an older build of the device library
-      # TODO: To be removed in the future.
-      if(WIN32)
-        hip_add_interface_compile_flags(hip::device -fms-extensions -fms-compatibility)
-        hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${HIP_PATH}/lib/bitcode\")
-      else()
-        hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${AMD_DEVICE_LIBS_PREFIX}/lib\")
-      endif()
-    endif()
   endif()
 
   hip_add_interface_link_flags(hip::device --hip-link)
diff --git a/include/hip/amd_detail/amd_device_functions.h b/include/hip/amd_detail/amd_device_functions.h
index ce421c63..57576f57 100644
--- a/include/hip/amd_detail/amd_device_functions.h
+++ b/include/hip/amd_detail/amd_device_functions.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -444,7 +444,8 @@ __device__ static inline unsigned long long int __double2ull_ru(double x) {
 __device__ static inline unsigned long long int __double2ull_rz(double x) {
     return (unsigned long long int)x;
 }
-
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
 __device__ static inline long long int __double_as_longlong(double x) {
     static_assert(sizeof(long long) == sizeof(double), "");
@@ -453,6 +454,7 @@ __device__ static inline long long int __double_as_longlong(double x) {
     return tmp;
 }
+#pragma clang diagnostic pop
 
 /*
 __device__ unsigned short __float2half_rn(float x);
@@ -775,21 +777,21 @@
 __device__
 inline
 static void __threadfence() {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
 }
 
 __device__
 inline
 static void __threadfence_block() {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }
 
 __device__
 inline
 static void __threadfence_system() {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
 }
 
 // abort
@@ -862,14 +864,11 @@ void __assertfail()
 }
 #endif /* defined(_WIN32) || defined(_WIN64) */
 
-__device__
-inline
-static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
-{
+__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
   if (flags) {
-    __atomic_work_item_fence(flags, __memory_order_release, scope);
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
     __builtin_amdgcn_s_barrier();
-    __atomic_work_item_fence(flags, __memory_order_acquire, scope);
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
   } else {
     __builtin_amdgcn_s_barrier();
   }
@@ -879,7 +878,7 @@
 __device__
 inline
 static void __barrier(int n) {
-  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
+  __work_group_barrier((__cl_mem_fence_flags)n);
 }
 
 __device__
@@ -922,7 +921,7 @@ int __syncthreads_or(int predicate)
   PIPE_ID    7:6   Pipeline from which the wave was dispatched.
   CU_ID      11:8  Compute Unit the wave is assigned to.
   SH_ID      12    Shader Array (within an SE) the wave is assigned to.
-  SE_ID      14:13 Shader Engine the wave is assigned to.
+  SE_ID      15:13 Shader Engine the wave is assigned to.
   TG_ID      19:16 Thread-group ID
   VM_ID      23:20 Virtual Memory ID
   QUEUE_ID   26:24 Queue from which this wave was dispatched.
@@ -935,7 +934,7 @@ int __syncthreads_or(int predicate)
 #define HW_ID_CU_ID_SIZE   4
 #define HW_ID_CU_ID_OFFSET 8
 
-#define HW_ID_SE_ID_SIZE   2
+#define HW_ID_SE_ID_SIZE   3
 #define HW_ID_SE_ID_OFFSET 13
 
 /*
diff --git a/include/hip/amd_detail/amd_hip_bfloat16.h b/include/hip/amd_detail/amd_hip_bfloat16.h
index 8c7f7839..deb3bfb7 100644
--- a/include/hip/amd_detail/amd_hip_bfloat16.h
+++ b/include/hip/amd_detail/amd_hip_bfloat16.h
@@ -32,10 +32,8 @@
 #include "host_defines.h"
 #if defined(__HIPCC_RTC__)
   #define __HOST_DEVICE__ __device__
-  #define HIP_OSTREAM __hip_internal::ostream
 #else
   #define __HOST_DEVICE__ __host__ __device__
-  #define HIP_OSTREAM std::ostream
 #endif
 
 #if __cplusplus < 201103L || !defined(__HIPCC__)
@@ -181,12 +179,12 @@ static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
 static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public) &&
               offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
               "internal hip_bfloat16 does not match public hip_bfloat16");
-#endif
 
-inline HIP_OSTREAM& operator<<(HIP_OSTREAM& os, const hip_bfloat16& bf16)
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
 {
-    return os << bf16;
+    return os << float(bf16);
 }
+#endif
 
 inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
 {
diff --git a/include/hip/amd_detail/amd_hip_complex.h b/include/hip/amd_detail/amd_hip_complex.h
index eba6eb53..9d9dfd5e 100644
--- a/include/hip/amd_detail/amd_hip_complex.h
+++ b/include/hip/amd_detail/amd_hip_complex.h
@@ -106,15 +106,20 @@ THE SOFTWARE.
         return lhs;                                                                                \
     }
 
-#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {                   \
-        lhs = lhs * rhs;                                                                           \
-        return lhs;                                                                                \
+#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {                   \
+        type temp{lhs};                                                                            \
+        lhs.x = rhs.x * temp.x - rhs.y * temp.y;                                                   \
+        lhs.y = rhs.y * temp.x + rhs.x * temp.y;                                                   \
+        return lhs;                                                                                \
     }
 
 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {                   \
-        lhs = lhs / rhs;                                                                           \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {                   \
+        type temp;                                                                                 \
+        temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y);                      \
+        temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y);                    \
+        lhs = temp;                                                                                \
         return lhs;                                                                                \
     }
diff --git a/include/hip/amd_detail/amd_hip_cooperative_groups.h b/include/hip/amd_detail/amd_hip_cooperative_groups.h
index 747f65a4..575a9f8e 100644
--- a/include/hip/amd_detail/amd_hip_cooperative_groups.h
+++ b/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,13 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wpadded"
+
 #if __cplusplus
 #if !defined(__HIPCC_RTC__)
 #include
@@ -69,8 +76,8 @@ class thread_group {
   // only when the group is supposed to contain only the calling the thread
   // (throurh the API - `this_thread()`), and in all other cases, this thread
   // group object is a sub-object of some other derived thread group object
-  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = (uint64_t)0,
-                                uint64_t mask = (uint64_t)0) {
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
+                                uint64_t mask = static_cast<uint64_t>(0)) {
     _type = type;
     _size = size;
     _mask = mask;
@@ -199,7 +206,7 @@ class thread_block : public thread_group {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
     // Invalid tile size, assert
     if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
-      __hip_assert(false && "invalid tile size");
+      __hip_assert(false && "invalid tile size")
     }
 
     thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
@@ -246,7 +253,7 @@ class tiled_group : public thread_group {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
 
     if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
-      __hip_assert(false && "invalid tile size");
+      __hip_assert(false && "invalid tile size")
     }
 
     if (size() <= tile_size) {
@@ -282,7 +289,6 @@ class tiled_group : public thread_group {
  * \details Represents a active thread group in a wavefront.
  *          This group type also supports sub-wave level intrinsics.
  */
-
 class coalesced_group : public thread_group {
  private:
   friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
@@ -300,8 +306,8 @@ class coalesced_group : public thread_group {
       // prepare a mask for further partitioning it so that it stays coalesced.
       if (coalesced_info.tiled_info.is_tiled) {
         unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
-        unsigned int masklength = min((unsigned int)size() - base_offset, tile_size);
-        lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
+        unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
+        lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
 
         member_mask <<= (__lane_id() & ~(tile_size - 1));
         coalesced_group coalesced_tile = coalesced_group(member_mask);
@@ -358,7 +364,7 @@ class coalesced_group : public thread_group {
   __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
     static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
 
-    srcRank = srcRank % size();
+    srcRank = srcRank % static_cast<int>(size());
 
     int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
         : (__AMDGCN_WAVEFRONT_SIZE == 64) ?
           __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
@@ -452,7 +458,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
       return (static_cast<const coalesced_group*>(this)->thread_rank());
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
       return -1;
     }
   }
@@ -476,7 +482,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const {
       return (static_cast<const coalesced_group*>(this)->is_valid());
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
       return false;
     }
   }
@@ -505,7 +511,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
       break;
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
     }
   }
 }
@@ -697,6 +703,6 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
   return impl::tiled_partition_internal<size, ParentCGTy>(g);
 }
 }  // namespace cooperative_groups
-
+#pragma clang diagnostic pop
 #endif  // __cplusplus
 #endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h
index fb07cfb6..fa58c3e4 100644
--- a/include/hip/amd_detail/amd_hip_fp16.h
+++ b/include/hip/amd_detail/amd_hip_fp16.h
@@ -1569,7 +1569,7 @@ THE SOFTWARE.
     __half hrcp(__half x)
     {
         return __half_raw{
-            __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
+            static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
     }
 
     inline
     __device__
@@ -1672,7 +1672,10 @@ THE SOFTWARE.
     __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
     inline
     __HOST_DEVICE__
-    __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
+    __half2 h2rcp(__half2 x) {
+        return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
+                          static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+    }
     inline
     __HOST_DEVICE__
     __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
@@ -1707,6 +1710,66 @@ THE SOFTWARE.
     using half = __half;
     using half2 = __half2;
 #endif
+    #if !defined(__HIPCC_RTC__)
+    #include "amd_device_functions.h"
+    #include "amd_warp_functions.h"
+    __device__
+    inline
+    __half __shfl(__half var, int src_lane, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl(tmp.i, src_lane, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl(tmp.i, src_lane, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl_up(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl_up(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl_down(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl_down(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+       return tmp.h;
+    }
+    #endif // !defined(__HIPCC_RTC__)
 #endif // defined(__cplusplus)
 #elif defined(__GNUC__)
 #include "hip_fp16_gcc.h"
diff --git a/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/include/hip/amd_detail/amd_hip_unsafe_atomics.h
index 243b5a64..0100e99e 100644
--- a/include/hip/amd_detail/amd_hip_unsafe_atomics.h
+++ b/include/hip/amd_detail/amd_hip_unsafe_atomics.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - Present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,9 @@ THE SOFTWARE.
 #pragma once
 
 #ifdef __cplusplus
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+
 /**
  * @brief Unsafe floating point rmw atomic add.
  *
@@ -175,7 +178,7 @@ __device__ inline float unsafeAtomicMin(float* addr, float val) {
  * @return Original value contained in \p addr.
 */
 __device__ inline double unsafeAtomicAdd(double* addr, double value) {
-#if (defined(__gfx90a__) || defined(__gfx940_)) && \
+#if (defined(__gfx90a__) || defined(__gfx940__)) && \
     __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
   return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
 #elif defined (__hip_atomic_fetch_add)
@@ -307,7 +310,7 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
  */
 __device__ inline float safeAtomicAdd(float* addr, float value) {
 #if defined(__gfx908__) || \
-    (defined(__gfx90a) && !__has_builtin(__hip_atomic_fetch_add))
+    (defined(__gfx90a__) && !__has_builtin(__hip_atomic_fetch_add))
   // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
   // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
   // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
@@ -563,4 +566,5 @@ __device__ inline double safeAtomicMin(double* addr, double val) {
 #endif
 }
 
+#pragma clang diagnostic pop
 #endif
diff --git a/include/hip/amd_detail/amd_hip_vector_types.h b/include/hip/amd_detail/amd_hip_vector_types.h
index dfd3b39a..8215fb02 100644
--- a/include/hip/amd_detail/amd_hip_vector_types.h
+++ b/include/hip/amd_detail/amd_hip_vector_types.h
@@ -544,6 +544,13 @@ template struct is_scalar : public integral_constant struct is_scalar : public integral_constant struct is_scalar : public integral_constant
         return HIP_vector_type<T, n>{x} -= y;
     }
-    template<typename T, unsigned int n>
-    __HOST_DEVICE__
-    inline
-    constexpr
-    HIP_vector_type<T, n> operator*(
-        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
-    {
-        return HIP_vector_type<T, n>{x} *= y;
-    }
     template<typename T, unsigned int n>
     __HOST_DEVICE__
     inline
@@ -737,15 +741,6 @@ template struct is_scalar : public integral_constant
         return HIP_vector_type<T, n>{x} *= y;
     }
-    template<typename T, unsigned int n>
-    __HOST_DEVICE__
-    inline
-    constexpr
-    HIP_vector_type<T, n> operator/(
-        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
-    {
-        return HIP_vector_type<T, n>{x} /= y;
-    }
     template<typename T, unsigned int n>
     __HOST_DEVICE__
     inline
diff --git a/include/hip/amd_detail/amd_math_functions.h b/include/hip/amd_detail/amd_math_functions.h
index 471f6ee3..3c17d298 100644
--- a/include/hip/amd_detail/amd_math_functions.h
+++ b/include/hip/amd_detail/amd_math_functions.h
@@ -640,22 +640,22 @@ inline
 float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
 
 __DEVICE__
 inline
-float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_rd(float x) { return __builtin_amdgcn_rcpf(x); }
 #endif
 
 __DEVICE__
 inline
-float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_rn(float x) { return __builtin_amdgcn_rcpf(x); }
 
 #if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
-float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_ru(float x) { return __builtin_amdgcn_rcpf(x); }
 
 __DEVICE__
 inline
-float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_rz(float x) { return __builtin_amdgcn_rcpf(x); }
 #endif
 
 __DEVICE__
 inline
-float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
+float __frsqrt_rn(float x) { return __builtin_amdgcn_rsqf(x); }
 
 #if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
@@ -1155,18 +1155,18 @@ inline
 double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
 
 __DEVICE__
 inline
-double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_rd(double x) { return __builtin_amdgcn_rcp(x); }
 #endif
 
 __DEVICE__
 inline
-double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_rn(double x) { return __builtin_amdgcn_rcp(x); }
 
 #if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
-double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_ru(double x) { return __builtin_amdgcn_rcp(x); }
 
 __DEVICE__
 inline
-double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_rz(double x) { return __builtin_amdgcn_rcp(x); }
 
 __DEVICE__
 inline
 double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
diff --git a/include/hip/amd_detail/amd_warp_functions.h b/include/hip/amd_detail/amd_warp_functions.h
index b18ff5f5..fb6065b1 100644
--- a/include/hip/amd_detail/amd_warp_functions.h
+++ b/include/hip/amd_detail/amd_warp_functions.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,14 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
 #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+
 __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
   union { int i; unsigned u; float f; } tmp; tmp.u = src;
   tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
@@ -491,4 +499,5 @@ unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
   return tmp1;
 }
 
+#pragma clang diagnostic pop
 #endif
diff --git a/include/hip/amd_detail/device_library_decls.h b/include/hip/amd_detail/device_library_decls.h
index 8add4fa2..02228705 100644
--- a/include/hip/amd_detail/device_library_decls.h
+++ b/include/hip/amd_detail/device_library_decls.h
@@ -128,26 +128,4 @@ __device__ inline static __local void* __to_local(unsigned x) { return (__local
 #define __CLK_LOCAL_MEM_FENCE 0x01
 typedef unsigned __cl_mem_fence_flags;
 
-typedef enum __memory_scope {
-  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
-  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
-  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
-  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
-} __memory_scope;
-
-// enum values aligned with what clang uses in EmitAtomicExpr()
-typedef enum __memory_order
-{
-  __memory_order_relaxed = __ATOMIC_RELAXED,
-  __memory_order_acquire = __ATOMIC_ACQUIRE,
-  __memory_order_release = __ATOMIC_RELEASE,
-  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
-  __memory_order_seq_cst = __ATOMIC_SEQ_CST
-} __memory_order;
-
-// Linked from hip.amdgcn.bc
-extern "C" __device__ void
-__atomic_work_item_fence(__cl_mem_fence_flags, __memory_order, __memory_scope);
-
 #endif
diff --git a/include/hip/amd_detail/hip_cooperative_groups_helper.h b/include/hip/amd_detail/hip_cooperative_groups_helper.h
index a90f0a3a..877c6a43 100644
--- a/include/hip/amd_detail/hip_cooperative_groups_helper.h
+++ b/include/hip/amd_detail/hip_cooperative_groups_helper.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -39,6 +39,12 @@ THE SOFTWARE.
 #define __align__(x) __attribute__((aligned(x)))
 #endif
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+
 #if !defined(__CG_QUALIFIER__)
 #define __CG_QUALIFIER__ __device__ __forceinline__
 #endif
@@ -92,15 +98,18 @@ typedef enum {
  */
 namespace multi_grid {
 
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); }
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); }
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); }
+__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); }
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
 
-__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); }
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
 
 __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
 
@@ -112,28 +121,28 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
 namespace grid {
 
 __CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
+  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                     (blockDim.x * gridDim.x));
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
   // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx = (uint32_t)((blockIdx.z * gridDim.y * gridDim.x) +
+  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                (blockIdx.y * gridDim.x) + (blockIdx.x));
 
   // Compute total number of threads being passed to reach current workgroup
   // within grid
   uint32_t num_threads_till_current_workgroup =
-      (uint32_t)(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
+      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
 
   // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank = (uint32_t)((threadIdx.z * blockDim.y * blockDim.x) +
+  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                           (threadIdx.y * blockDim.x) + (threadIdx.x));
 
   return (num_threads_till_current_workgroup + local_thread_rank);
 }
 
-__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); }
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
 
 __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
 
@@ -146,19 +155,21 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
 namespace workgroup {
 
 __CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3((uint32_t)blockIdx.x, (uint32_t)blockIdx.y, (uint32_t)blockIdx.z));
+  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
+               static_cast<uint32_t>(blockIdx.z)));
 }
 
 __CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3((uint32_t)threadIdx.x, (uint32_t)threadIdx.y, (uint32_t)threadIdx.z));
+  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
+               static_cast<uint32_t>(threadIdx.z)));
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t size() {
-  return ((uint32_t)(blockDim.x * blockDim.y * blockDim.z));
+  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return ((uint32_t)((threadIdx.z * blockDim.y * blockDim.x) +
+  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                      (threadIdx.y * blockDim.x) + (threadIdx.x)));
 }
 
@@ -187,8 +198,8 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); }
 
 //
 // For each thread, this function returns the number of active threads which
 // have i-th bit of x set and come before the current thread.
-__device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
-  int counter=0;
+__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
+  unsigned int counter=0;
 #if __AMDGCN_WAVEFRONT_SIZE == 32
   counter = __builtin_amdgcn_mbcnt_lo(x, add);
 #else
@@ -206,5 +217,6 @@ __device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
 
 }  // namespace cooperative_groups
 
+#pragma clang diagnostic pop
 #endif  // __cplusplus
 #endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/include/hip/amd_detail/hip_fp16_math_fwd.h b/include/hip/amd_detail/hip_fp16_math_fwd.h
index 36942c1a..caf6ec75 100644
--- a/include/hip/amd_detail/hip_fp16_math_fwd.h
+++ b/include/hip/amd_detail/hip_fp16_math_fwd.h
@@ -44,7 +44,6 @@ extern "C"
     __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
     __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
     __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
-    __device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
     __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
     __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
     __device__ _Float16 __ocml_sin_f16(_Float16);
@@ -73,11 +72,6 @@ extern "C"
     __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
     __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
     __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
-    __device__ inline
-    __2f16 __llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL.
-    {
-        return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)};
-    }
     __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
     __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
     __device__ __2f16 __ocml_sin_2f16(__2f16);
diff --git a/include/hip/amd_detail/hip_prof_str.h b/include/hip/amd_detail/hip_prof_str.h
index 7ec70fdc..d0b24d01 100644
--- a/include/hip/amd_detail/hip_prof_str.h
+++ b/include/hip/amd_detail/hip_prof_str.h
@@ -370,10 +370,12 @@ enum hip_api_id_t {
   HIP_API_ID_hipGraphMemFreeNodeGetParams = 357,
   HIP_API_ID_hipModuleLaunchCooperativeKernel = 358,
   HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice = 359,
-  HIP_API_ID_LAST = 359,
+  HIP_API_ID_hipArray3DGetDescriptor = 360,
+  HIP_API_ID_hipArrayGetDescriptor = 361,
+  HIP_API_ID_hipArrayGetInfo = 362,
+  HIP_API_ID_hipStreamGetDevice = 363,
+  HIP_API_ID_LAST = 363,
 
-  HIP_API_ID_hipArray3DGetDescriptor = HIP_API_ID_NONE,
-  HIP_API_ID_hipArrayGetDescriptor = HIP_API_ID_NONE,
   HIP_API_ID_hipBindTexture = HIP_API_ID_NONE,
   HIP_API_ID_hipBindTexture2D = HIP_API_ID_NONE,
   HIP_API_ID_hipBindTextureToArray = HIP_API_ID_NONE,
@@ -420,8 +422,11 @@ static inline const char* hip_api_name(const uint32_t id) {
   case HIP_API_ID___hipPopCallConfiguration: return "__hipPopCallConfiguration";
   case HIP_API_ID___hipPushCallConfiguration: return "__hipPushCallConfiguration";
   case HIP_API_ID_hipArray3DCreate: return "hipArray3DCreate";
+  case HIP_API_ID_hipArray3DGetDescriptor: return "hipArray3DGetDescriptor";
   case HIP_API_ID_hipArrayCreate: return "hipArrayCreate";
   case HIP_API_ID_hipArrayDestroy: return "hipArrayDestroy";
+  case HIP_API_ID_hipArrayGetDescriptor: return "hipArrayGetDescriptor";
+  case HIP_API_ID_hipArrayGetInfo: return "hipArrayGetInfo";
   case HIP_API_ID_hipChooseDevice: return "hipChooseDevice";
   case HIP_API_ID_hipConfigureCall: return "hipConfigureCall";
   case HIP_API_ID_hipCreateSurfaceObject: return "hipCreateSurfaceObject";
@@ -739,6 +744,7 @@ static inline const char* hip_api_name(const uint32_t id) {
   case HIP_API_ID_hipStreamEndCapture: return "hipStreamEndCapture";
   case HIP_API_ID_hipStreamGetCaptureInfo: return "hipStreamGetCaptureInfo";
   case HIP_API_ID_hipStreamGetCaptureInfo_v2: return "hipStreamGetCaptureInfo_v2";
+  case HIP_API_ID_hipStreamGetDevice: return "hipStreamGetDevice";
   case HIP_API_ID_hipStreamGetFlags: return "hipStreamGetFlags";
   case HIP_API_ID_hipStreamGetPriority: return "hipStreamGetPriority";
   case HIP_API_ID_hipStreamIsCapturing: return "hipStreamIsCapturing";
@@ -782,8 +788,11 @@ static inline uint32_t hipApiIdByName(const char* name) {
   if (strcmp("__hipPopCallConfiguration", name) == 0) return HIP_API_ID___hipPopCallConfiguration;
   if (strcmp("__hipPushCallConfiguration", name) == 0) return HIP_API_ID___hipPushCallConfiguration;
   if (strcmp("hipArray3DCreate", name) == 0) return HIP_API_ID_hipArray3DCreate;
+  if (strcmp("hipArray3DGetDescriptor", name) == 0) return HIP_API_ID_hipArray3DGetDescriptor;
   if (strcmp("hipArrayCreate", name) == 0) return HIP_API_ID_hipArrayCreate;
   if (strcmp("hipArrayDestroy", name) == 0) return HIP_API_ID_hipArrayDestroy;
+  if (strcmp("hipArrayGetDescriptor", name) == 0) return HIP_API_ID_hipArrayGetDescriptor;
+  if (strcmp("hipArrayGetInfo", name) == 0) return HIP_API_ID_hipArrayGetInfo;
   if (strcmp("hipChooseDevice", name) == 0) return HIP_API_ID_hipChooseDevice;
   if (strcmp("hipConfigureCall", name) == 0) return HIP_API_ID_hipConfigureCall;
   if (strcmp("hipCreateSurfaceObject", name) == 0) return HIP_API_ID_hipCreateSurfaceObject;
@@ -1101,6 +1110,7 @@ static inline uint32_t hipApiIdByName(const char* name) {
   if (strcmp("hipStreamEndCapture", name) == 0) return HIP_API_ID_hipStreamEndCapture;
   if (strcmp("hipStreamGetCaptureInfo", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo;
   if (strcmp("hipStreamGetCaptureInfo_v2", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo_v2;
+  if (strcmp("hipStreamGetDevice", name) == 0) return HIP_API_ID_hipStreamGetDevice;
   if (strcmp("hipStreamGetFlags", name) == 0) return HIP_API_ID_hipStreamGetFlags;
   if (strcmp("hipStreamGetPriority", name) == 0) return HIP_API_ID_hipStreamGetPriority;
   if (strcmp("hipStreamIsCapturing", name) == 0) return HIP_API_ID_hipStreamIsCapturing;
@@ -1164,6 +1174,12 @@ typedef struct hip_api_data_s {
       const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray;
       HIP_ARRAY3D_DESCRIPTOR pAllocateArray__val;
     } hipArray3DCreate;
+    struct {
+      HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor;
+      HIP_ARRAY3D_DESCRIPTOR pArrayDescriptor__val;
+      hipArray* array;
+      hipArray array__val;
+    } hipArray3DGetDescriptor;
     struct {
       hipArray** pHandle;
       hipArray* pHandle__val;
@@ -1174,6 +1190,22 @@ typedef struct hip_api_data_s {
       hipArray* array;
      hipArray array__val;
     } hipArrayDestroy;
+    struct {
+      HIP_ARRAY_DESCRIPTOR* pArrayDescriptor;
+      HIP_ARRAY_DESCRIPTOR pArrayDescriptor__val;
+      hipArray* array;
+      hipArray array__val;
+    } hipArrayGetDescriptor;
+    struct {
+      hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      hipExtent* extent;
+      hipExtent extent__val;
+      unsigned int* flags;
+      unsigned int flags__val;
+      hipArray* array;
+      hipArray array__val;
+    } hipArrayGetInfo;
     struct {
       int* device;
       int device__val;
@@ -3033,6 +3065,11 @@ typedef struct hip_api_data_s {
       size_t* numDependencies_out;
       size_t numDependencies_out__val;
     } hipStreamGetCaptureInfo_v2;
+    struct {
+      hipStream_t stream;
+      hipDevice_t* device;
+      hipDevice_t device__val;
+    } hipStreamGetDevice;
     struct {
       hipStream_t stream;
       unsigned int* flags;
@@ -3252,6 +3289,11 @@ typedef struct hip_api_data_s {
   cb_data.args.hipArray3DCreate.array = (hipArray**)array; \
   cb_data.args.hipArray3DCreate.pAllocateArray = (const HIP_ARRAY3D_DESCRIPTOR*)pAllocateArray; \
 };
+// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArray3DGetDescriptor.pArrayDescriptor = (HIP_ARRAY3D_DESCRIPTOR*)pArrayDescriptor; \
+  cb_data.args.hipArray3DGetDescriptor.array = (hipArray*)array; \
+};
 // hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]
 #define INIT_hipArrayCreate_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipArrayCreate.pHandle = (hipArray**)array; \
@@ -3261,6 +3303,18 @@ typedef struct hip_api_data_s {
 #define INIT_hipArrayDestroy_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipArrayDestroy.array = (hipArray*)array; \
 };
+// hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayGetDescriptor.pArrayDescriptor = (HIP_ARRAY_DESCRIPTOR*)pArrayDescriptor; \
+  cb_data.args.hipArrayGetDescriptor.array = (hipArray*)array; \
+};
+// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray*', 'array')]
+#define INIT_hipArrayGetInfo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayGetInfo.desc = (hipChannelFormatDesc*)desc; \
+  cb_data.args.hipArrayGetInfo.extent = (hipExtent*)extent; \
+  cb_data.args.hipArrayGetInfo.flags = (unsigned int*)flags; \
+  cb_data.args.hipArrayGetInfo.array = (hipArray*)array; \
+};
 // hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')]
 #define INIT_hipChooseDevice_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipChooseDevice.device = (int*)device; \
@@ -5185,6 +5239,11 @@ typedef struct hip_api_data_s {
   cb_data.args.hipStreamGetCaptureInfo_v2.dependencies_out = (const hipGraphNode_t**)dependencies_out; \
   cb_data.args.hipStreamGetCaptureInfo_v2.numDependencies_out = (size_t*)numDependencies_out; \
 };
+// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')]
+#define INIT_hipStreamGetDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetDevice.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetDevice.device = (hipDevice_t*)device; \
+};
 // hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]
 #define INIT_hipStreamGetFlags_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipStreamGetFlags.stream = (hipStream_t)stream; \
@@ -5376,10 +5435,6 @@ typedef struct hip_api_data_s {
 #define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)
 
 // Macros for non-public API primitives
-// hipArray3DGetDescriptor()
-#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) {};
-// hipArrayGetDescriptor()
-#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) {};
 // hipBindTexture()
 #define INIT_hipBindTexture_CB_ARGS_DATA(cb_data) {};
 // hipBindTexture2D()
@@ -5478,6 +5533,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
     if (data->args.hipArray3DCreate.array) data->args.hipArray3DCreate.array__val = *(data->args.hipArray3DCreate.array);
     if (data->args.hipArray3DCreate.pAllocateArray) data->args.hipArray3DCreate.pAllocateArray__val = *(data->args.hipArray3DCreate.pAllocateArray);
     break;
+// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+  case HIP_API_ID_hipArray3DGetDescriptor:
+    if (data->args.hipArray3DGetDescriptor.pArrayDescriptor) data->args.hipArray3DGetDescriptor.pArrayDescriptor__val = *(data->args.hipArray3DGetDescriptor.pArrayDescriptor);
+    if (data->args.hipArray3DGetDescriptor.array) data->args.hipArray3DGetDescriptor.array__val = *(data->args.hipArray3DGetDescriptor.array);
+    break;
 // hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]
   case HIP_API_ID_hipArrayCreate:
     if (data->args.hipArrayCreate.pHandle) data->args.hipArrayCreate.pHandle__val = *(data->args.hipArrayCreate.pHandle);
@@ -5487,6 +5547,18 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
   case HIP_API_ID_hipArrayDestroy:
     if (data->args.hipArrayDestroy.array) data->args.hipArrayDestroy.array__val = *(data->args.hipArrayDestroy.array);
     break;
+// hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+  case HIP_API_ID_hipArrayGetDescriptor:
+    if (data->args.hipArrayGetDescriptor.pArrayDescriptor) data->args.hipArrayGetDescriptor.pArrayDescriptor__val = *(data->args.hipArrayGetDescriptor.pArrayDescriptor);
+    if (data->args.hipArrayGetDescriptor.array) data->args.hipArrayGetDescriptor.array__val = *(data->args.hipArrayGetDescriptor.array);
+    break;
+// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray*', 'array')]
+  case HIP_API_ID_hipArrayGetInfo:
+    if (data->args.hipArrayGetInfo.desc) data->args.hipArrayGetInfo.desc__val = *(data->args.hipArrayGetInfo.desc);
+    if (data->args.hipArrayGetInfo.extent) data->args.hipArrayGetInfo.extent__val = *(data->args.hipArrayGetInfo.extent);
+    if (data->args.hipArrayGetInfo.flags) data->args.hipArrayGetInfo.flags__val = *(data->args.hipArrayGetInfo.flags);
+    if (data->args.hipArrayGetInfo.array) data->args.hipArrayGetInfo.array__val = *(data->args.hipArrayGetInfo.array);
+    break;
 // hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')]
   case HIP_API_ID_hipChooseDevice:
     if (data->args.hipChooseDevice.device) data->args.hipChooseDevice.device__val = *(data->args.hipChooseDevice.device);
@@ -6706,6 +6778,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
     if (data->args.hipStreamGetCaptureInfo_v2.dependencies_out) data->args.hipStreamGetCaptureInfo_v2.dependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.dependencies_out);
     if (data->args.hipStreamGetCaptureInfo_v2.numDependencies_out) data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.numDependencies_out);
     break;
+// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')]
+  case HIP_API_ID_hipStreamGetDevice:
+    if (data->args.hipStreamGetDevice.device) data->args.hipStreamGetDevice.device__val = *(data->args.hipStreamGetDevice.device);
+    break;
 // hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]
   case HIP_API_ID_hipStreamGetFlags:
    if (data->args.hipStreamGetFlags.flags) data->args.hipStreamGetFlags.flags__val = *(data->args.hipStreamGetFlags.flags);
@@ -6881,6 +6957,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
     else { oss << ", pAllocateArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DCreate.pAllocateArray__val); }
     oss << ")";
     break;
+  case HIP_API_ID_hipArray3DGetDescriptor:
+    oss << "hipArray3DGetDescriptor(";
+    if (data->args.hipArray3DGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL";
+    else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.pArrayDescriptor__val); }
+    if (data->args.hipArray3DGetDescriptor.array == NULL) oss << ", array=NULL";
+    else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.array__val); }
+    oss << ")";
+    break;
   case HIP_API_ID_hipArrayCreate:
     oss << "hipArrayCreate(";
     if (data->args.hipArrayCreate.pHandle == NULL) oss << "pHandle=NULL";
@@ -6895,6 +6979,26 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
     else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayDestroy.array__val); }
     oss << ")";
     break;
+  case HIP_API_ID_hipArrayGetDescriptor:
+    oss << "hipArrayGetDescriptor(";
+    if (data->args.hipArrayGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL";
+    else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.pArrayDescriptor__val); }
+    if (data->args.hipArrayGetDescriptor.array == NULL) oss << ", array=NULL";
+    else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.array__val); }
+    oss << ")";
+    break;
+  case HIP_API_ID_hipArrayGetInfo:
+    oss << "hipArrayGetInfo(";
+    if (data->args.hipArrayGetInfo.desc == NULL) oss << "desc=NULL";
+    else { oss << "desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.desc__val); }
+    if (data->args.hipArrayGetInfo.extent == NULL) oss << ", extent=NULL";
+    else { oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.extent__val); }
+    if (data->args.hipArrayGetInfo.flags == NULL) oss << ", flags=NULL";
+    else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.flags__val); }
+    if (data->args.hipArrayGetInfo.array == NULL) oss << ", array=NULL";
+    else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.array__val); }
+    oss << ")";
+    break;
   case HIP_API_ID_hipChooseDevice:
     oss << "hipChooseDevice(";
     if (data->args.hipChooseDevice.device == NULL) oss << "device=NULL";
@@ -9404,6 +9508,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
     else { oss << ", numDependencies_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val); }
     oss << ")";
     break;
+  case HIP_API_ID_hipStreamGetDevice:
+    oss << "hipStreamGetDevice(";
+    oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.stream);
+    if (data->args.hipStreamGetDevice.device == NULL) oss << ", device=NULL";
+    else { oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.device__val); }
+    oss << ")";
+    break;
   case HIP_API_ID_hipStreamGetFlags:
     oss << "hipStreamGetFlags(";
     oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetFlags.stream);
diff --git a/include/hip/amd_detail/math_fwd.h b/include/hip/amd_detail/math_fwd.h
index 050c88c1..9e999268 100644
--- a/include/hip/amd_detail/math_fwd.h
+++ b/include/hip/amd_detail/math_fwd.h
@@ -386,19 +386,6 @@ float __ocml_fma_rtp_f32(float, float, float);
 __device__
 __attribute__((const))
 float __ocml_fma_rtz_f32(float, float, float);
-
-__device__
-__attribute__((const))
-float __llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
-__device__
-__attribute__((const))
-float __llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
-__device__
-__attribute__((const))
-float __llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
-__device__
-__attribute__((const))
-float __llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
 // END INTRINSICS
 // END FLOAT
@@ -697,13 +684,6 @@ double __ocml_fma_rtp_f64(double, double, double);
 __device__
 __attribute__((const))
 double __ocml_fma_rtz_f64(double, double, double);
-
-__device__
-__attribute__((const))
-double __llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
-__device__
-__attribute__((const))
-double __llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
 // END INTRINSICS
 // END DOUBLE
diff --git a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0c492b7c..4c8be9af 100644
--- a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 #define CUDA_11060 11060
+#define CUDA_12000 12000
 
 #ifdef __cplusplus
 extern "C" {
@@ -2506,6 +2507,20 @@ inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback,
         cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags));
 }
 
+inline static hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) {
+    hipCtx_t context;
+    auto err = hipCUResultTohipError(cuStreamGetCtx(stream, &context));
+    if (err != hipSuccess) return err;
+
+    err = hipCUResultTohipError(cuCtxPushCurrent(context));
+    if (err != hipSuccess) return err;
+
+    err = hipCUResultTohipError(cuCtxGetDevice(device));
+    if (err != hipSuccess) return err;
+
+    return hipCUResultTohipError(cuCtxPopCurrent(&context));
+}
+
 inline static hipError_t hipDriverGetVersion(int* driverVersion) {
     return hipCUDAErrorTohipError(cudaDriverGetVersion(driverVersion));
 }
@@ -2773,6 +2788,7 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
     return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED
 inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex, const void* devPtr,
@@ -2786,6 +2802,8 @@ __HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
     const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
     return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
 }
+#endif // CUDA_VERSION < CUDA_12000
+
 inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
                                                         hipChannelFormatKind f) {
@@ -2818,10 +2836,12 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
     return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
     size_t* offset, const struct textureReference* texref) {
     return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
 }
+#endif
 
 inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
 {
@@ -3067,6 +3087,7 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                                                                blockSize, dynamicSMemSize, flags));
 }
 
+#if CUDA_VERSION < CUDA_12000
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
                                         const void* devPtr, size_t size = UINT_MAX) {
@@ -3109,6 +3130,7 @@ __HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
     struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
     return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
 }
+#endif // CUDA_VERSION < CUDA_12000
 
 template <typename T>
 inline static hipChannelFormatDesc hipCreateChannelDesc() {
@@ -3186,6 +3208,21 @@ inline static hipError_t hipArray3DCreate(hiparray* pHandle,
                                           const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray) {
     return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
 }
 
+inline static hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent,
+                                         unsigned int* flags, hipArray* array) {
+    return hipCUDAErrorTohipError(cudaArrayGetInfo(desc, extent, flags, array));
+}
+
+inline static hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor,
+                                               hipArray* array) {
+    return hipCUResultTohipError(cuArrayGetDescriptor(pArrayDescriptor, (CUarray)array));
+}
+
+inline static hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor,
+                                                 hipArray* array) {
+    return hipCUResultTohipError(cuArray3DGetDescriptor(pArrayDescriptor, (CUarray)array));
+}
+
 inline static hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) {
     return hipCUDAErrorTohipError(cudaStreamBeginCapture(stream, mode));
 }
diff --git a/src/amd_hsa_elf.hpp b/src/amd_hsa_elf.hpp
index 45cf5c22..ca22fd1f 100644
--- a/src/amd_hsa_elf.hpp
+++ b/src/amd_hsa_elf.hpp
@@ -22,6 +22,9 @@ THE SOFTWARE.
 
 #pragma once
 
+// This header file is partially copied from
+// https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/ELF.h
+
 // AMDGPU OS for HSA compatible compute kernels.
 enum { ELFOSABI_AMDGPU_HSA = 64, ELFOSABI_AMDGPU_PAL = 65, ELFOSABI_AMDGPU_MESA3D = 66 };
@@ -57,38 +60,51 @@ enum : unsigned {
   EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600,
   EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
 
-  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
-  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
-  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
-  EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
-  EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
-  EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
-  EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X027 = 0x027,
-  EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
-  EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
-  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
-  EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
-  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
-  EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
-  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
-  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
-  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
-  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
-  EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
-  EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
-  EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
-  EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
-  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
-  EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
-  EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
-  EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
-  EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
-  EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
-  EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
-  EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
+  // AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
+  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
+  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
+  EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
+  EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
+  EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
+  EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
+  EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
+  EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
+  EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
+  EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
+  EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
+  EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
+  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043,
+  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044,
+  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
+  EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
+  EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
+
+  // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102,
 
   // Indicates if the "xnack" target feature is enabled for all code contained
   // in the object.
diff --git a/src/amdhip.def b/src/amdhip.def
index 9a1d3e6e..ffaff7f5 100644
--- a/src/amdhip.def
+++ b/src/amdhip.def
@@ -103,6 +103,9 @@ hipMemPoolImportPointer
 hipArrayCreate
 hipArray3DCreate
 hipArrayDestroy
+hipArrayGetInfo
+hipArrayGetDescriptor
+hipArray3DGetDescriptor
 hipMallocArray
 hipMemAdvise
 hipMemAllocPitch
@@ -190,6 +193,7 @@ hipStreamCreate
 hipStreamCreateWithFlags
 hipStreamCreateWithPriority
 hipStreamDestroy
+hipStreamGetDevice
 hipStreamGetFlags
 hipStreamQuery
 hipStreamSynchronize
diff --git a/src/hip_code_object.cpp b/src/hip_code_object.cpp
index 0d5fb1b4..dd9de637 100644
--- a/src/hip_code_object.cpp
+++ b/src/hip_code_object.cpp
@@ -31,8 +31,6 @@ THE SOFTWARE.
#include "platform/program.hpp" #include -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); hipError_t ihipFree(void* ptr); // forward declaration of methods required for managed variables hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); @@ -172,6 +170,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx90c"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX940: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx940"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; @@ -187,6 +190,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1012"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1013: + xnackSupported = true; + sramEccSupported = false; + proc_name = "gfx1013"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1030: xnackSupported = false; sramEccSupported = false; @@ -207,6 +215,41 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1033"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1034: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1034"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1035: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1035"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1036: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1036"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1100"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1101: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1101"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1102: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1102"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1103: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1103"; + break; default: return false; } @@ -590,10 +633,10 @@ hipError_t DynCO::initDynManagedVars(const std::string& managedVar) { it->second->setManagedVarInfo(pointer, dvar->size()); // copy initial value to the managed variable to the managed memory allocated - amd::HostQueue* queue = hip::getNullStream(); - if (queue != nullptr) { + hip::Stream* stream = hip::getNullStream(); + if (stream != nullptr) { status = ihipMemcpy(pointer, reinterpret_cast
(dvar->device_ptr()), dvar->size(), - hipMemcpyDeviceToDevice, *queue); + hipMemcpyDeviceToDevice, *stream); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status, managedVar.c_str()); @@ -613,7 +656,7 @@ hipError_t DynCO::initDynManagedVars(const std::string& managedVar) { } // copy managed memory pointer to the managed device variable status = ihipMemcpy(reinterpret_cast
(dvar->device_ptr()), &pointer, dvar->size(), - hipMemcpyHostToDevice, *queue); + hipMemcpyHostToDevice, *stream); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status, managedVar.c_str()); @@ -850,10 +893,10 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) { DeviceVar* dvar = nullptr; IHIP_RETURN_ONFAIL(var->getStatDeviceVar(&dvar, deviceId)); - amd::HostQueue* queue = g_devices.at(deviceId)->NullStream(); - if (queue != nullptr) { + hip::Stream* stream = g_devices.at(deviceId)->NullStream(); + if (stream != nullptr) { err = ihipMemcpy(reinterpret_cast
(dvar->device_ptr()), var->getManagedVarPtr(), - dvar->size(), hipMemcpyHostToDevice, *queue); + dvar->size(), hipMemcpyHostToDevice, *stream); } else { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL"); return hipErrorInvalidResourceHandle; diff --git a/src/hip_context.cpp b/src/hip_context.cpp index 824e6959..f639d4ff 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -29,7 +29,7 @@ std::vector g_devices; namespace hip { thread_local TlsAggregator tls; -Device* host_device = nullptr; +amd::Context* host_context = nullptr; //init() is only to be called from the HIP_INIT macro only once bool init() { @@ -44,7 +44,7 @@ bool init() { if (!amd::Runtime::init()) { return false; } - LogPrintfInfo("Direct Dispatch: %d", AMD_DIRECT_DISPATCH); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH); const std::vector& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false); @@ -74,7 +74,7 @@ bool init() { if (CL_SUCCESS != hContext->create(nullptr)) { hContext->release(); } - host_device = new Device(hContext, -1); + host_context = hContext; PlatformState::instance().init(); return true; @@ -91,21 +91,21 @@ void setCurrentDevice(unsigned int index) { amd::Os::setPreferredNumaNode(preferredNumaNode); } -amd::HostQueue* getQueue(hipStream_t stream) { +hip::Stream* getStream(hipStream_t stream) { if (stream == nullptr) { return getNullStream(); } else { - amd::HostQueue* queue = reinterpret_cast(stream)->asHostQueue(); - if (!(reinterpret_cast(stream)->Flags() & hipStreamNonBlocking)) { + hip::Stream* hip_stream = reinterpret_cast(stream); + if (!(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; - iHipWaitActiveStreams(queue, WaitNullStreamOnly); + iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); } - return queue; + return hip_stream; } } // ================================================================================================ -amd::HostQueue* getNullStream(amd::Context& ctx) { +hip::Stream* getNullStream(amd::Context& ctx) { for (auto& it : g_devices) { if (it->asContext() == &ctx) { return it->NullStream(); @@ -113,7 +113,7 @@ amd::HostQueue* getNullStream(amd::Context& ctx) { } // If it's a pure SVM allocation with system memory access, then it shouldn't matter which device // runtime selects by default - if (hip::host_device->asContext() == &ctx) { + if (hip::host_context == &ctx) { // Return current... return getNullStream(); } @@ -131,7 +131,7 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -amd::HostQueue* getNullStream() { +hip::Stream* getNullStream() { Device* device = getCurrentDevice(); return device ? 
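Note: getProcName() above is the runtime's decode table from a code object's ELF e_flags machine field to a gfx ISA name plus its xnack/sramecc capability. A stand-alone sketch of the same decode, for illustration only — the ELF64 e_flags offset (48) and the conventional EF_AMDGPU_MACH mask (0x0ff) come from the ELF/AMDGPU ABI, not from this patch, and only a few table entries are shown:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Hypothetical helper mirroring a few entries of the table added above.
    static const char* MachToName(uint32_t mach) {
      switch (mach) {
        case 0x040: return "gfx940";   // EF_AMDGPU_MACH_AMDGCN_GFX940
        case 0x042: return "gfx1013";  // EF_AMDGPU_MACH_AMDGCN_GFX1013
        case 0x041: return "gfx1100";  // EF_AMDGPU_MACH_AMDGCN_GFX1100
        case 0x047: return "gfx1102";  // EF_AMDGPU_MACH_AMDGCN_GFX1102
        default:    return "unknown";
      }
    }

    static void PrintCodeObjectTarget(const uint8_t* elf_image) {
      uint32_t e_flags;
      memcpy(&e_flags, elf_image + 48, sizeof(e_flags));  // e_flags offset in Elf64_Ehdr
      printf("code object targets %s\n", MachToName(e_flags & 0x0ffu));
    }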
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index 824e6959..f639d4ff 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -29,7 +29,7 @@ std::vector<hip::Device*> g_devices;
 namespace hip {
 thread_local TlsAggregator tls;
-Device* host_device = nullptr;
+amd::Context* host_context = nullptr;
 //init() is only to be called from the HIP_INIT macro only once
 bool init() {
@@ -44,7 +44,7 @@ bool init() {
   if (!amd::Runtime::init()) {
     return false;
   }
-  LogPrintfInfo("Direct Dispatch: %d", AMD_DIRECT_DISPATCH);
+  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH);

   const std::vector<amd::Device*>& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
@@ -74,7 +74,7 @@ bool init() {
   if (CL_SUCCESS != hContext->create(nullptr)) {
     hContext->release();
   }
-  host_device = new Device(hContext, -1);
+  host_context = hContext;

   PlatformState::instance().init();
   return true;
@@ -91,21 +91,21 @@ void setCurrentDevice(unsigned int index) {
   amd::Os::setPreferredNumaNode(preferredNumaNode);
 }

-amd::HostQueue* getQueue(hipStream_t stream) {
+hip::Stream* getStream(hipStream_t stream) {
   if (stream == nullptr) {
     return getNullStream();
   } else {
-    amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(stream)->asHostQueue();
-    if (!(reinterpret_cast<hip::Stream*>(stream)->Flags() & hipStreamNonBlocking)) {
+    hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(stream);
+    if (!(hip_stream->Flags() & hipStreamNonBlocking)) {
       constexpr bool WaitNullStreamOnly = true;
-      iHipWaitActiveStreams(queue, WaitNullStreamOnly);
+      iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly);
     }
-    return queue;
+    return hip_stream;
   }
 }

 // ================================================================================================
-amd::HostQueue* getNullStream(amd::Context& ctx) {
+hip::Stream* getNullStream(amd::Context& ctx) {
   for (auto& it : g_devices) {
     if (it->asContext() == &ctx) {
       return it->NullStream();
@@ -113,7 +113,7 @@ amd::HostQueue* getNullStream(amd::Context& ctx) {
   }
   // If it's a pure SVM allocation with system memory access, then it shouldn't matter which device
   // runtime selects by default
-  if (hip::host_device->asContext() == &ctx) {
+  if (hip::host_context == &ctx) {
     // Return current...
     return getNullStream();
   }
@@ -131,7 +131,7 @@ int getDeviceID(amd::Context& ctx) {
 }

 // ================================================================================================
-amd::HostQueue* getNullStream() {
+hip::Stream* getNullStream() {
   Device* device = getCurrentDevice();
   return device ? device->NullStream() : nullptr;
 }
diff --git a/src/hip_conversions.hpp b/src/hip_conversions.hpp
index 757ccc79..ef928225 100644
--- a/src/hip_conversions.hpp
+++ b/src/hip_conversions.hpp
@@ -168,6 +168,12 @@ hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
   case CL_SIGNED_INT8:
     return HIP_AD_FORMAT_SIGNED_INT8;

+  case CL_UNSIGNED_INT16:
+    return HIP_AD_FORMAT_UNSIGNED_INT16;
+
+  case CL_SIGNED_INT16:
+    return HIP_AD_FORMAT_SIGNED_INT16;
+
   case CL_SIGNED_INT32:
     return HIP_AD_FORMAT_SIGNED_INT32;
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index fedd62e5..8782b6c3 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -26,25 +26,17 @@ namespace hip {

 // ================================================================================================
-amd::HostQueue* Device::NullStream(bool skip_alloc) {
-  amd::HostQueue* null_queue = null_stream_.asHostQueue(skip_alloc);
-  if (null_queue == nullptr) {
-    return nullptr;
+hip::Stream* Device::NullStream(bool skip_alloc) {
+  if (null_stream_ == nullptr && !skip_alloc) {
+    null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true);
   }
-  // Wait for all active streams before executing commands on the default
-  iHipWaitActiveStreams(null_queue);
-  return null_queue;
-}
-
-// ================================================================================================
-Stream* Device::GetNullStream() {
-  amd::HostQueue* null_queue = null_stream_.asHostQueue();
-  if (null_queue == nullptr) {
+  if (null_stream_ == nullptr) {
     return nullptr;
   }
   // Wait for all active streams before executing commands on the default
-  iHipWaitActiveStreams(null_queue);
-  return &null_stream_;
+  iHipWaitActiveStreams(null_stream_);
+  return null_stream_;
 }

 // ================================================================================================
@@ -54,6 +46,18 @@ bool Device::Create() {
   if (default_mem_pool_ == nullptr) {
     return false;
   }
+
+  // Create graph memory pool
+  graph_mem_pool_ = new MemoryPool(this);
+  if (graph_mem_pool_ == nullptr) {
+    return false;
+  }
+
+  uint64_t max_size = std::numeric_limits<uint64_t>::max();
+  // Use maximum value to hold memory, because current implementation doesn't support VM
+  // Note: the call for the threshold is always successful
+  auto error = graph_mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &max_size);
+
   // Current is default pool after device creation
   current_mem_pool_ = default_mem_pool_;
   return true;
@@ -79,7 +83,7 @@ void Device::RemoveMemoryPool(MemoryPool* pool) {
 bool Device::FreeMemory(amd::Memory* memory, Stream* stream) {
   amd::ScopedLock lock(lock_);
   // Search for memory in the entire list of pools
-  for (auto& it : mem_pools_) {
+  for (auto it : mem_pools_) {
     if (it->FreeMemory(memory, stream)) {
       return true;
     }
@@ -91,7 +95,7 @@ bool Device::FreeMemory(amd::Memory* memory, Stream* stream) {
 void Device::ReleaseFreedMemory(Stream* stream) {
   amd::ScopedLock lock(lock_);
   // Search for memory in the entire list of pools
-  for (auto& it : mem_pools_) {
+  for (auto it : mem_pools_) {
     it->ReleaseFreedMemory(stream);
   }
 }
@@ -100,20 +104,23 @@ void Device::ReleaseFreedMemory(Stream* stream) {
 void Device::RemoveStreamFromPools(Stream* stream) {
   amd::ScopedLock lock(lock_);
   // Update all pools with the destroyed stream
-  for (auto& it : mem_pools_) {
+  for (auto it : mem_pools_) {
     it->RemoveStream(stream);
   }
 }

 // ================================================================================================
 void Device::Reset() {
-  auto it = mem_pools_.begin();
-  while (it != mem_pools_.end()) {
-    auto current = it++;
-    (*current)->ReleaseAllMemory();
-    delete *current;
+  {
+    amd::ScopedLock lock(lock_);
+    auto it = mem_pools_.begin();
+    while (it != mem_pools_.end()) {
+      auto current = it++;
+      (*current)->ReleaseAllMemory();
+      delete *current;
+    }
+    mem_pools_.clear();
   }
-  mem_pools_.clear();
   flags_ = hipDeviceScheduleSpin;
   hip::Stream::destroyAllStreams(deviceId_);
   amd::MemObjMap::Purge(devices()[0]);
@@ -125,6 +132,14 @@ Device::~Device() {
   if (default_mem_pool_ != nullptr) {
     default_mem_pool_->release();
   }
+
+  if (graph_mem_pool_ != nullptr) {
+    graph_mem_pool_->release();
+  }
+
+  if (null_stream_ != nullptr) {
+    null_stream_->release();
+  }
 }
 }
diff --git a/src/hip_device_runtime.cpp b/src/hip_device_runtime.cpp
index def9bcb0..19bed5ef 100644
--- a/src/hip_device_runtime.cpp
+++ b/src/hip_device_runtime.cpp
@@ -512,9 +512,9 @@ hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) {
 hipError_t hipDeviceSynchronize ( void ) {
   HIP_INIT_API(hipDeviceSynchronize);
-  amd::HostQueue* queue = hip::getNullStream();
+  hip::Stream* stream = hip::getNullStream();

-  if (!queue) {
+  if (!stream) {
     HIP_RETURN(hipErrorOutOfMemory);
   }
@@ -522,7 +522,7 @@ hipError_t hipDeviceSynchronize ( void ) {
     HIP_RETURN(hipErrorStreamCaptureUnsupported);
   }

-  queue->finish();
+  stream->finish();

   hip::Stream::syncNonBlockingStreams(hip::getCurrentDevice()->deviceId());
@@ -602,7 +602,7 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) {
     switch (scheduleFlag) {
       case hipDeviceScheduleAuto:
         // Current behavior is different from the spec, due to MT usage in runtime
-        if (hip::host_device->devices().size() >= std::thread::hardware_concurrency()) {
+        if (hip::host_context->devices().size() >= std::thread::hardware_concurrency()) {
          device->SetActiveWait(false);
          break;
        }
diff --git a/src/hip_event.cpp b/src/hip_event.cpp
index f556cabe..747b69ce 100644
--- a/src/hip_event.cpp
+++ b/src/hip_event.cpp
@@ -177,12 +177,12 @@ int64_t EventDD::time(bool getStartTs) const {
   }
 }

-hipError_t Event::streamWaitCommand(amd::Command*& command, amd::HostQueue* queue) {
+hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
   amd::Command::EventWaitList eventWaitList;
   if (event_ != nullptr) {
     eventWaitList.push_back(event_);
   }
-  command = new amd::Marker(*queue, kMarkerDisableFlush, eventWaitList);
+  command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList);

   if (command == NULL) {
     return hipErrorOutOfMemory;
@@ -196,17 +196,17 @@ hipError_t Event::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* com
 }

 hipError_t Event::streamWait(hipStream_t stream, uint flags) {
-  amd::HostQueue* queue = hip::getQueue(stream);
+  hip::Stream* hip_stream = hip::getStream(stream);
   // Access to event_ object must be lock protected
   amd::ScopedLock lock(lock_);
-  if ((event_ == nullptr) || (event_->command().queue() == queue) || ready()) {
+  if ((event_ == nullptr) || (event_->command().queue() == hip_stream) || ready()) {
     return hipSuccess;
   }
   if (!event_->notifyCmdQueue()) {
     return hipErrorLaunchOutOfResources;
   }
   amd::Command* command;
-  hipError_t status = streamWaitCommand(command, queue);
+  hipError_t status = streamWaitCommand(command, hip_stream);
   if (status != hipSuccess) {
     return status;
   }
@@ -218,20 +218,19 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) {
   return hipSuccess;
 }

-hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* queue,
+hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                 uint32_t ext_flags ) {
   if (command == nullptr) {
     int32_t releaseFlags = ((ext_flags == 0) ? flags : ext_flags) &
-        (hipEventReleaseToSystem | hipEventReleaseToDevice);
-    if (releaseFlags & hipEventReleaseToDevice) {
-      releaseFlags = amd::Device::kCacheStateAgent;
-    } else if (releaseFlags & hipEventReleaseToSystem) {
-      releaseFlags = amd::Device::kCacheStateSystem;
-    } else {
+        (hipEventReleaseToDevice | hipEventReleaseToSystem |
+         hipEventDisableSystemFence);
+    if (releaseFlags & hipEventDisableSystemFence) {
       releaseFlags = amd::Device::kCacheStateIgnore;
+    } else {
+      releaseFlags = amd::Device::kCacheStateInvalid;
     }
     // Always submit a EventMarker.
-    command = new hip::EventMarker(*queue, !kMarkerDisableFlush, true, releaseFlags);
+    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags);
   }
   return hipSuccess;
 }
@@ -249,10 +248,10 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command
 }

 hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
-  amd::HostQueue* queue = hip::getQueue(stream);
+  hip::Stream* hip_stream = hip::getStream(stream);
   // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
   amd::ScopedLock lock(lock_);
-  hipError_t status = recordCommand(command, queue);
+  hipError_t status = recordCommand(command, hip_stream);
   if (status != hipSuccess) {
     return hipSuccess;
   }
@@ -279,14 +278,21 @@ bool isValid(hipEvent_t event) {
 // ================================================================================================
 hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
   unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
-      hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess;
+      hipEventReleaseToDevice | hipEventReleaseToSystem |
+      hipEventInterprocess | hipEventDisableSystemFence;

-  const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem);
+  const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem |
+                                 hipEventDisableSystemFence);
   // can't set any unsupported flags.
-  // can't set both release flags
+  // can set only one of the release flags.
   // if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set
   const bool illegalFlags = (flags & ~supportedFlags) ||
-      ((flags & releaseFlags) == releaseFlags) ||
+      ([](unsigned int num) {
+        unsigned int bitcount;
+        for (bitcount = 0; num; bitcount++) {
+          num &= num - 1;
+        }
+        return bitcount; } (flags & releaseFlags) > 1) ||
      ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
   if (!illegalFlags) {
     hip::Event* e = nullptr;
@@ -379,8 +385,8 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
     return hipErrorInvalidHandle;
   }
   hip::Event* e = reinterpret_cast<hip::Event*>(event);
-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (g_devices[e->deviceId()]->devices()[0] != &queue->device()) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) {
     return hipErrorInvalidHandle;
   }
   return e->addMarker(stream, nullptr, true);
@@ -404,9 +410,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) {
     HIP_RETURN(hipErrorInvalidHandle);
   }

-  if (hip::Stream::StreamCaptureOngoing() == true) {
-    HIP_RETURN(hipErrorStreamCaptureUnsupported);
-  }
   hip::Event* e = reinterpret_cast<hip::Event*>(event);
   HIP_RETURN(e->synchronize());
 }
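Note: the immediately-invoked lambda in ihipEventCreateWithFlags is a population count (Kernighan's bit trick): now that hipEventDisableSystemFence joins the two release flags, "at most one of them" replaces the old "not both" test. A stand-alone sketch of the same check; the flag values below are placeholders for illustration, not the real HIP constants:

    #include <cassert>

    // Kernighan's trick: clearing the lowest set bit each iteration means
    // the loop count equals the number of set bits. (C++20's std::popcount
    // from <bit> would do the same.)
    static unsigned BitCount(unsigned int num) {
      unsigned int bitcount = 0;
      for (; num; ++bitcount) {
        num &= num - 1;  // clear the lowest set bit
      }
      return bitcount;
    }

    int main() {
      const unsigned kToDevice = 0x1, kToSystem = 0x2, kNoFence = 0x4;  // illustrative values
      const unsigned releaseFlags = kToDevice | kToSystem | kNoFence;
      assert(BitCount(kToDevice & releaseFlags) <= 1);                // accepted
      assert(BitCount((kToDevice | kToSystem) & releaseFlags) > 1);   // rejected
      return 0;
    }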
diff --git a/src/hip_event.hpp b/src/hip_event.hpp
index e08ea33f..91a8193d 100644
--- a/src/hip_event.hpp
+++ b/src/hip_event.hpp
@@ -78,9 +78,9 @@ typedef struct ihipIpcEventShmem_s {
 class EventMarker : public amd::Marker {
  public:
-  EventMarker(amd::HostQueue& queue, bool disableFlush, bool markerTs = false,
+  EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
               int32_t scope = amd::Device::kCacheStateInvalid)
-      : amd::Marker(queue, disableFlush) {
+      : amd::Marker(stream, disableFlush) {
     profilingInfo_.enabled_ = true;
     profilingInfo_.callback_ = nullptr;
     profilingInfo_.marker_ts_ = markerTs;
@@ -116,11 +116,11 @@ class Event {
   virtual hipError_t synchronize();
   hipError_t elapsedTime(Event& eStop, float& ms);

-  virtual hipError_t streamWaitCommand(amd::Command*& command, amd::HostQueue* queue);
+  virtual hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
   virtual hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
   virtual hipError_t streamWait(hipStream_t stream, uint flags);

-  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
+  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                    uint32_t flags = 0);
   virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
   hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
@@ -175,7 +175,7 @@ class Event {

  protected:
   amd::Monitor lock_;
-  amd::HostQueue* stream_;
+  hip::Stream* stream_;
   amd::Event* event_;
   int device_id_;
   //! Flag to indicate hipEventRecord has not been called. This is needed for
@@ -224,7 +224,7 @@ class IPCEvent : public Event {
   hipError_t synchronize();
   hipError_t query();

-  hipError_t streamWaitCommand(amd::Command*& command, amd::HostQueue* queue);
+  hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
   hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
   hipError_t streamWait(hipStream_t stream, uint flags);
diff --git a/src/hip_event_ipc.cpp b/src/hip_event_ipc.cpp
index 7385566b..706b3d44 100644
--- a/src/hip_event_ipc.cpp
+++ b/src/hip_event_ipc.cpp
@@ -102,8 +102,8 @@ hipError_t IPCEvent::synchronize() {
   return hipSuccess;
 }

-hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, amd::HostQueue* queue) {
-  command = new amd::Marker(*queue, false);
+hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
+  command = new amd::Marker(*stream, false);
   if (command == NULL) {
     return hipErrorOutOfMemory;
   }
@@ -125,12 +125,12 @@ hipError_t IPCEvent::enqueueStreamWaitCommand(hipStream_t stream, amd::Command*
 }

 hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
-  amd::HostQueue* queue = hip::getQueue(stream);
+  hip::Stream* hip_stream = hip::getStream(stream);
   amd::ScopedLock lock(lock_);

   if(query() != hipSuccess) {
     amd::Command* command;
-    hipError_t status = streamWaitCommand(command, queue);
+    hipError_t status = streamWaitCommand(command, hip_stream);
     if (status != hipSuccess) {
       return status;
     }
@@ -140,18 +140,17 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
   return hipSuccess;
 }

-hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags) {
+hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) {
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
-    command = new amd::Marker(*queue, kMarkerDisableFlush);
+    command = new amd::Marker(*stream, kMarkerDisableFlush);
   } else {
-    return Event::recordCommand(command, queue);
+    return Event::recordCommand(command, stream);
   }
   return hipSuccess;
 }

 hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
-  amd::HostQueue* queue = hip::getQueue(stream);
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
     amd::Event& tEvent = command->event();
diff --git a/src/hip_fatbin.cpp b/src/hip_fatbin.cpp
index bafc7436..c2b7ff75 100644
--- a/src/hip_fatbin.cpp
+++ b/src/hip_fatbin.cpp
@@ -23,13 +23,15 @@ FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) : fdesc_(amd:
     fname_ = std::string();
   }

-  fatbin_dev_info_.resize(g_devices.size());
+  fatbin_dev_info_.resize(g_devices.size(), nullptr);
 }

 FatBinaryInfo::~FatBinaryInfo() {
-  for (auto& fbd: fatbin_dev_info_) {
-    delete fbd;
+  for (auto* fbd: fatbin_dev_info_) {
+    if (fbd != nullptr) {
+      delete fbd;
+    }
   }

   if (fdesc_ > 0) {
@@ -298,6 +300,10 @@ hipError_t FatBinaryInfo::AddDevProgram(const int device_id) {
   DeviceIdCheck(device_id);
   FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id];
+  if (fbd_info == nullptr) {
+    return hipErrorInvalidKernelFile;
+  }
+
   // If fat binary was already added, skip this step and return success
   if (fbd_info->add_dev_prog_ == false) {
     amd::Context* ctx = g_devices[device_id]->asContext();
diff --git a/src/hip_gl.cpp b/src/hip_gl.cpp
index 216a2cb4..ce692753 100644
--- a/src/hip_gl.cpp
+++ b/src/hip_gl.cpp
@@ -637,13 +637,12 @@ hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,
     HIP_RETURN(hipErrorUnknown);
   }

-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (nullptr == queue) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (nullptr == hip_stream) {
     HIP_RETURN(hipErrorUnknown);
   }

-  amd::HostQueue& hostQueue = *queue;
-  if (!hostQueue.context().glenv() || !hostQueue.context().glenv()->isAssociated()) {
+  if (!hip_stream->context().glenv() || !hip_stream->context().glenv()->isAssociated()) {
     LogWarning("\"amdContext\" is not created from GL context or share list");
     HIP_RETURN(hipErrorUnknown);
   }
@@ -658,7 +657,7 @@ hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,

   //! Now create command and enqueue
   amd::AcquireExtObjectsCommand* command = new amd::AcquireExtObjectsCommand(
-      hostQueue, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS);
+      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS);
   if (command == nullptr) {
     HIP_RETURN(hipErrorUnknown);
   }
@@ -712,13 +711,12 @@ hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources
   }

   // Wait for the current host queue
-  hip::getQueue(stream)->finish();
+  hip::getStream(stream)->finish();

-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (nullptr == queue) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (nullptr == hip_stream) {
     HIP_RETURN(hipErrorUnknown);
   }
-  amd::HostQueue& hostQueue = *queue;

   std::vector<amd::Memory*> memObjects;
   hipError_t err = hipSetInteropObjects(count, reinterpret_cast<void**>(resources), memObjects);
@@ -730,7 +728,7 @@ hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources

   // Now create command and enqueue
   amd::ReleaseExtObjectsCommand* command = new amd::ReleaseExtObjectsCommand(
-      hostQueue, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS);
+      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS);
   if (command == nullptr) {
     HIP_RETURN(hipErrorUnknown);
   }
diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp
index 4b5d9e15..fddc49b0 100644
--- a/src/hip_graph.cpp
+++ b/src/hip_graph.cpp
@@ -36,11 +36,16 @@ inline hipError_t ihipGraphAddNode(hipGraphNode_t graphNode, hipGraph_t graph,
                                    const hipGraphNode_t* pDependencies, size_t numDependencies,
                                    bool capture = true) {
   graph->AddNode(graphNode);
+  std::unordered_set<hipGraphNode_t> DuplicateDep;
   for (size_t i = 0; i < numDependencies; i++) {
     if ((!hipGraphNode::isNodeValid(pDependencies[i])) ||
         (graph != pDependencies[i]->GetParentGraph())) {
       return hipErrorInvalidValue;
     }
+    if (DuplicateDep.find(pDependencies[i]) != DuplicateDep.end()) {
+      return hipErrorInvalidValue;
+    }
+    DuplicateDep.insert(pDependencies[i]);
     pDependencies[i]->AddEdge(graphNode);
   }
   if (capture == false) {
@@ -1210,7 +1215,8 @@ hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t grap
   HIP_RETURN(status);
 }

-hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph) {
+hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                uint64_t flags = 0) {
   if (pGraphExec == nullptr || graph == nullptr) {
     HIP_RETURN(hipErrorInvalidValue);
   }
@@ -1227,7 +1233,8 @@ hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph) {
   clonedGraph->LevelOrder(levelOrder);
   clonedGraph->GetUserObjs(graphExeUserObj);
   *pGraphExec =
-      new hipGraphExec(levelOrder, parallelLists, nodeWaitLists, clonedNodes, graphExeUserObj);
+      new hipGraphExec(levelOrder, parallelLists, nodeWaitLists, clonedNodes,
+                       graphExeUserObj, flags);
   if (*pGraphExec != nullptr) {
     return (*pGraphExec)->Init();
   } else {
@@ -1242,7 +1249,7 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
 }

 hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
-                                        unsigned long long flags) {
+                                        unsigned long long flags = 0) {
   HIP_INIT_API(hipGraphInstantiateWithFlags, pGraphExec, graph, flags);
   if (pGraphExec == nullptr || graph == nullptr) {
     HIP_RETURN(hipErrorInvalidValue);
   }
@@ -1674,13 +1681,19 @@ hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t
   if (s->GetCaptureStatus() == hipStreamCaptureStatusNone) {
     HIP_RETURN(hipErrorIllegalState);
   }
-  if ((numDependencies > 0 && dependencies == nullptr) ||
+  if ((s->GetCaptureGraph()->GetNodeCount() < numDependencies) ||
+      (numDependencies > 0 && dependencies == nullptr) ||
       (flags != 0 && flags != hipStreamAddCaptureDependencies &&
        flags != hipStreamSetCaptureDependencies)) {
     HIP_RETURN(hipErrorInvalidValue);
   }
   std::vector<hipGraphNode_t> depNodes;
+  const std::vector<Node>& graphNodes = s->GetCaptureGraph()->GetNodes();
   for (int i = 0; i < numDependencies; i++) {
+    if ((dependencies[i] == nullptr) ||
+        std::find(std::begin(graphNodes), std::end(graphNodes), dependencies[i]) == std::end(graphNodes)) {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
     depNodes.push_back(dependencies[i]);
   }
   if (flags == hipStreamAddCaptureDependencies) {
@@ -1700,7 +1713,7 @@ hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* fr
   }
   for (size_t i = 0; i < numDependencies; i++) {
     if (to[i]->GetParentGraph() != graph || from[i]->GetParentGraph() != graph ||
-        from[i]->RemoveEdge(to[i]) == false) {
+        from[i]->RemoveUpdateEdge(to[i]) == false) {
       HIP_RETURN(hipErrorInvalidValue);
     }
   }
@@ -2203,54 +2216,63 @@ hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType at
   if ((static_cast<size_t>(device) >= g_devices.size()) || device < 0 || value == nullptr) {
     HIP_RETURN(hipErrorInvalidDevice);
   }
-  // later use this to access memory pool
-  auto* deviceHandle = g_devices[device]->devices()[0];
+  hipError_t result = hipErrorInvalidValue;
   switch (attr) {
     case hipGraphMemAttrUsedMemCurrent:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrUsedMemCurrent, value);
       break;
     case hipGraphMemAttrUsedMemHigh:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrUsedMemHigh, value);
       break;
     case hipGraphMemAttrReservedMemCurrent:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrReservedMemCurrent, value);
       break;
     case hipGraphMemAttrReservedMemHigh:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrReservedMemHigh, value);
       break;
     default:
-      return HIP_RETURN(hipErrorInvalidValue);
+      break;
   }
-  return HIP_RETURN(hipSuccess);
+  return HIP_RETURN(result);
 }

+// ================================================================================================
 hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value) {
   HIP_INIT_API(hipDeviceSetGraphMemAttribute, device, attr, value);
   if ((static_cast<size_t>(device) >= g_devices.size()) || device < 0 || value == nullptr) {
     HIP_RETURN(hipErrorInvalidDevice);
   }
-  // later use this to access memory pool
-  auto* deviceHandle = g_devices[device]->devices()[0];
+  hipError_t result = hipErrorInvalidValue;
   switch (attr) {
     case hipGraphMemAttrUsedMemHigh:
+      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(
+          hipMemPoolAttrUsedMemHigh, value);
       break;
     case hipGraphMemAttrReservedMemHigh:
+      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(
+          hipMemPoolAttrReservedMemHigh, value);
       break;
     default:
-      return HIP_RETURN(hipErrorInvalidValue);
+      break;
   }
-  return HIP_RETURN(hipSuccess);
+  return HIP_RETURN(result);
 }

+// ================================================================================================
 hipError_t hipDeviceGraphMemTrim(int device) {
   HIP_INIT_API(hipDeviceGraphMemTrim, device);
   if ((static_cast<size_t>(device) >= g_devices.size()) || device < 0) {
     HIP_RETURN(hipErrorInvalidDevice);
   }
-  // not implemented yet
+  g_devices[device]->GetGraphMemoryPool()->TrimTo(0);
   return HIP_RETURN(hipSuccess);
 }

+// ================================================================================================
 hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy,
                                unsigned int initialRefcount, unsigned int flags) {
   HIP_INIT_API(hipUserObjectCreate, object_out, ptr, destroy, initialRefcount, flags);
@@ -2275,6 +2297,10 @@ hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count) {
   if (object->referenceCount() < count || !hipUserObject::isUserObjvalid(object)) {
     HIP_RETURN(hipSuccess);
   }
+  //! If all the counts are gone, no longer need the obj in the list
+  if (object->referenceCount() == count) {
+    hipUserObject::removeUSerObj(object);
+  }
   object->decreaseRefCount(count);
   HIP_RETURN(hipSuccess);
 }
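Note: with the per-device graph memory pool in place, the attribute getters and setters above stop returning hard-coded zeros and forward to the pool. A hedged usage sketch from the application side; it assumes the attribute values are reported as 64-bit byte counts, mirroring the hipMemPoolAttr* queries they forward to:

    #include <hip/hip_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      int device = 0;
      (void)hipSetDevice(device);
      uint64_t used = 0, reserved = 0;
      // Query how much memory graph launches currently hold on this device.
      (void)hipDeviceGetGraphMemAttribute(device, hipGraphMemAttrUsedMemCurrent, &used);
      (void)hipDeviceGetGraphMemAttribute(device, hipGraphMemAttrReservedMemCurrent, &reserved);
      printf("graph pool: %llu bytes used, %llu bytes reserved\n",
             (unsigned long long)used, (unsigned long long)reserved);
      // Release memory the graph pool is holding but not using (TrimTo(0) above).
      (void)hipDeviceGraphMemTrim(device);
      return 0;
    }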
diff --git a/src/hip_graph_helper.hpp b/src/hip_graph_helper.hpp
index 69780338..20d01165 100644
--- a/src/hip_graph_helper.hpp
+++ b/src/hip_graph_helper.hpp
@@ -5,9 +5,9 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p);
 hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);

 hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
-                             hipMemcpyKind kind, amd::HostQueue& queue, bool isAsync = false);
+                             hipMemcpyKind kind, hip::Stream& stream, bool isAsync = false);

-void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, amd::HostQueue& queue);
+void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream);

 bool IsHtoHMemcpy(void* dst, const void* src, hipMemcpyKind kind);
@@ -26,19 +26,19 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
                                    uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
                                    uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
                                    uint32_t blockDimZ, uint32_t sharedMemBytes,
-                                   amd::HostQueue* queue, void** kernelParams, void** extra,
+                                   hip::Stream* stream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags,
                                    uint32_t params, uint32_t gridId, uint32_t numGrids,
                                    uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice);

 hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p,
-                               amd::HostQueue* queue);
+                               hip::Stream* stream);

 hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, int64_t value,
-                             size_t valueSize, size_t sizeBytes, amd::HostQueue* queue);
+                             size_t valueSize, size_t sizeBytes, hip::Stream* stream);

 hipError_t ihipMemset3DCommand(std::vector<amd::Command*>& commands, hipPitchedPtr pitchedDevPtr,
-                               int value, hipExtent extent, amd::HostQueue* queue, size_t elementSize = 1);
+                               int value, hipExtent extent, hip::Stream* stream, size_t elementSize = 1);

 hipError_t ihipMemcpySymbol_validate(const void* symbol, size_t sizeBytes, size_t offset,
                                      size_t& sym_size, hipDeviceptr_t& device_ptr);
diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp
index 3ce74cab..f4060a0f 100644
--- a/src/hip_graph_internal.cpp
+++ b/src/hip_graph_internal.cpp
@@ -77,14 +77,14 @@ hipError_t hipGraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size
     if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
       return hipErrorInvalidValue;
     }
-    if (kind != hipMemcpyHostToDevice) {
+    if ((kind != hipMemcpyHostToDevice) && (kind != hipMemcpyDefault)) {
       return hipErrorInvalidValue;
     }
   } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {  // device to host
     if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
       return hipErrorInvalidValue;
     }
-    if (kind != hipMemcpyDeviceToHost) {
+    if ((kind != hipMemcpyDeviceToHost) && (kind != hipMemcpyDefault)) {
       return hipErrorInvalidValue;
     }
   } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
@@ -98,56 +98,6 @@ hipError_t hipGraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size
   return hipSuccess;
 }

-hipError_t hipGraphMemcpyNode1D::SetCommandParams(void* dst, const void* src, size_t count,
-                                                  hipMemcpyKind kind) {
-  hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
-  if (status != hipSuccess) {
-    return status;
-  }
-  size_t sOffsetOrig = 0;
-  amd::Memory* origSrcMemory = getMemoryObject(src, sOffsetOrig);
-  size_t dOffsetOrig = 0;
-  amd::Memory* origDstMemory = getMemoryObject(dst, dOffsetOrig);
-
-  size_t sOffset = 0;
-  amd::Memory* srcMemory = getMemoryObject(src, sOffset);
-  size_t dOffset = 0;
-  amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
-
-  if ((srcMemory == nullptr) && (dstMemory != nullptr)) {
-    if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    amd::WriteMemoryCommand* command = reinterpret_cast<amd::WriteMemoryCommand*>(commands_[0]);
-    command->setParams(*dstMemory->asBuffer(), dOffset, count, src);
-  } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {
-    if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    amd::ReadMemoryCommand* command = reinterpret_cast<amd::ReadMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory->asBuffer(), sOffset, count, dst);
-  } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
-    if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    amd::CopyMemoryP2PCommand* command = reinterpret_cast<amd::CopyMemoryP2PCommand*>(commands_[0]);
-    command->setParams(*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, count);
-    // Make sure runtime has valid memory for the command execution. P2P access
-    // requires page table mapping on the current device to another GPU memory
-    if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) {
-      delete command;
-      return hipErrorInvalidValue;
-    }
-  } else {
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, count);
-  }
-  return hipSuccess;
-}
-
 hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) {
   hipError_t status = ihipMemcpy3D_validate(pNodeParams);
   if (status != hipSuccess) {
@@ -297,185 +247,6 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam
   return hipSuccess;
 }

-hipError_t hipGraphMemcpyNode::SetCommandParams(const hipMemcpy3DParms* pNodeParams) {
-  hipError_t status = ihipMemcpy3D_validate(pNodeParams);
-  if (status != hipSuccess) {
-    return status;
-  }
-  const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pNodeParams);
-  // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the
-  // (unified virtual address space) base address of the source data and the bytes per row to apply.
-  // {src/dst}Array is ignored.
-  hipMemoryType srcMemoryType = pCopy.srcMemoryType;
-  if (srcMemoryType == hipMemoryTypeUnified) {
-    srcMemoryType =
-        amd::MemObjMap::FindMemObj(pCopy.srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (srcMemoryType == hipMemoryTypeHost) {
-      // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system
-      // memory.
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->srcHost = pCopy.srcDevice;
-    }
-  }
-  hipMemoryType dstMemoryType = pCopy.dstMemoryType;
-  if (dstMemoryType == hipMemoryTypeUnified) {
-    dstMemoryType =
-        amd::MemObjMap::FindMemObj(pCopy.dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (srcMemoryType == hipMemoryTypeHost) {
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->dstHost = pCopy.dstDevice;
-    }
-  }
-
-  // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned.
-  // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning.
-  if (srcMemoryType == hipMemoryTypeHost) {
-    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy.srcHost);
-    srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (srcMemoryType == hipMemoryTypeDevice) {
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->srcDevice = const_cast<void*>(pCopy.srcHost);
-    }
-  }
-  if (dstMemoryType == hipMemoryTypeHost) {
-    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy.dstHost);
-    dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (dstMemoryType == hipMemoryTypeDevice) {
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->dstDevice = const_cast<void*>(pCopy.dstDevice);
-    }
-  }
-
-  amd::Coord3D srcOrigin = {pCopy.srcXInBytes, pCopy.srcY, pCopy.srcZ};
-  amd::Coord3D dstOrigin = {pCopy.dstXInBytes, pCopy.dstY, pCopy.dstZ};
-  amd::Coord3D copyRegion = {pCopy.WidthInBytes, pCopy.Height, pCopy.Depth};
-
-  if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeDevice)) {
-    // Host to Device.
-    amd::Memory* dstMemory;
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-
-    status =
-        ihipMemcpyHtoDValidate(pCopy.srcHost, pCopy.dstDevice, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch,
-                               pCopy.dstPitch * pCopy.dstHeight, dstMemory, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::WriteMemoryCommand* command = reinterpret_cast<amd::WriteMemoryCommand*>(commands_[0]);
-    command->setParams(*dstMemory, {dstRect.start_, 0, 0}, copyRegion, pCopy.srcHost, dstRect,
-                       srcRect);
-  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) {
-    // Device to Host.
-    amd::Memory* srcMemory;
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-    status =
-        ihipMemcpyDtoHValidate(pCopy.srcDevice, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch,
-                               pCopy.dstPitch * pCopy.dstHeight, srcMemory, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::ReadMemoryCommand* command = reinterpret_cast<amd::ReadMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory, {srcRect.start_, 0, 0}, copyRegion, pCopy.dstHost, srcRect,
-                       dstRect);
-    command->setSource(*srcMemory);
-    command->setOrigin({srcRect.start_, 0, 0});
-    command->setSize(copyRegion);
-    command->setDestination(pCopy.dstHost);
-    command->setBufRect(srcRect);
-    command->setHostRect(dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) {
-    // Device to Device.
-    amd::Memory* srcMemory;
-    amd::Memory* dstMemory;
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-
-    status = ihipMemcpyDtoDValidate(pCopy.srcDevice, pCopy.dstDevice, srcOrigin, dstOrigin,
-                                    copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight,
-                                    pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcMemory,
-                                    dstMemory, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory, *dstMemory, {srcRect.start_, 0, 0}, {dstRect.start_, 0, 0},
-                       copyRegion, srcRect, dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) {
-    amd::Image* dstImage;
-    amd::BufferRect srcRect;
-
-    status =
-        ihipMemcpyHtoAValidate(pCopy.srcHost, pCopy.dstArray, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, dstImage, srcRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::WriteMemoryCommand* command = reinterpret_cast<amd::WriteMemoryCommand*>(commands_[0]);
-    command->setParams(*dstImage, dstOrigin, copyRegion,
-                       static_cast<const char*>(pCopy.srcHost) + srcRect.start_, pCopy.srcPitch,
-                       pCopy.srcPitch * pCopy.srcHeight);
-  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) {
-    // Image to Host.
-    amd::Image* srcImage;
-    amd::BufferRect dstRect;
-
-    status =
-        ihipMemcpyAtoHValidate(pCopy.srcArray, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcImage, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::ReadMemoryCommand* command = reinterpret_cast<amd::ReadMemoryCommand*>(commands_[0]);
-    command->setParams(*srcImage, srcOrigin, copyRegion,
-                       static_cast<char*>(pCopy.dstHost) + dstRect.start_, pCopy.dstPitch,
-                       pCopy.dstPitch * pCopy.dstHeight);
-  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) {
-    // Device to Image.
-    amd::Image* dstImage;
-    amd::Memory* srcMemory;
-    amd::BufferRect dstRect;
-    amd::BufferRect srcRect;
-    status = ihipMemcpyDtoAValidate(pCopy.srcDevice, pCopy.dstArray, srcOrigin, dstOrigin,
-                                    copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight,
-                                    dstImage, srcMemory, dstRect, srcRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory, *dstImage, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) {
-    // Image to Device.
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-    amd::Memory* dstMemory;
-    amd::Image* srcImage;
-    status = ihipMemcpyAtoDValidate(pCopy.srcArray, pCopy.dstDevice, srcOrigin, dstOrigin,
-                                    copyRegion, pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight,
-                                    dstMemory, srcImage, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) {
-    amd::Image* srcImage;
-    amd::Image* dstImage;
-
-    status = ihipMemcpyAtoAValidate(pCopy.srcArray, pCopy.dstArray, srcOrigin, dstOrigin,
-                                    copyRegion, srcImage, dstImage);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion);
-  } else {
-    return hipErrorInvalidValue;
-  }
-  return hipSuccess;
-}
-
-
 bool ihipGraph::isGraphValid(ihipGraph* pGraph) {
   amd::ScopedLock lock(graphSetLock_);
   if (graphSet_.find(pGraph) == graphSet_.end()) {
@@ -679,41 +450,38 @@ bool hipGraphExec::isGraphExecValid(hipGraphExec* pGraphExec) {
   return true;
 }

-hipError_t hipGraphExec::CreateQueues(size_t numQueues) {
-  parallelQueues_.reserve(numQueues);
-  for (size_t i = 0; i < numQueues; i++) {
-    amd::HostQueue* queue;
-    queue = new amd::HostQueue(
-        *hip::getCurrentDevice()->asContext(), *hip::getCurrentDevice()->devices()[0], 0,
-        amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal);
-
-    bool result = (queue != nullptr) ? queue->create() : false;
-    // Create a host queue
-    if (result) {
-      parallelQueues_.push_back(queue);
-    } else {
-      ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create host queue\n");
+hipError_t hipGraphExec::CreateStreams(uint32_t num_streams) {
+  parallel_streams_.reserve(num_streams);
+  for (uint32_t i = 0; i < num_streams; ++i) {
+    auto stream = new hip::Stream(hip::getCurrentDevice(),
+                                  hip::Stream::Priority::Normal, hipStreamNonBlocking);
+    if (stream == nullptr || !stream->Create()) {
+      if (stream != nullptr) {
+        stream->release();
+      }
+      ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n");
       return hipErrorOutOfMemory;
     }
+    parallel_streams_.push_back(stream);
   }
   return hipSuccess;
 }

 hipError_t hipGraphExec::Init() {
   hipError_t status;
-  size_t reqNumQueues = 1;
+  size_t min_num_streams = 1;
   for (auto& node : levelOrder_) {
-    reqNumQueues += node->GetNumParallelQueues();
+    min_num_streams += node->GetNumParallelStreams();
   }
-  status = CreateQueues(parallelLists_.size() - 1 + reqNumQueues);
+  status = CreateStreams(parallelLists_.size() - 1 + min_num_streams);
   return status;
 }

 hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
                         std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
                         std::vector<Node>& levelOrder, std::vector<amd::Command*>& rootCommands,
-                        amd::Command*& endCommand, amd::HostQueue* queue) {
+                        amd::Command*& endCommand, hip::Stream* stream) {
   hipError_t status;
   for (auto& node : levelOrder) {
     // TODO: clone commands from next launch
@@ -763,7 +531,7 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
     }
   }
   if (!graphLastCmdWaitList.empty()) {
-    endCommand = new amd::Marker(*queue, false, graphLastCmdWaitList);
+    endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList);
     if (endCommand == nullptr) {
       return hipErrorOutOfMemory;
     }
@@ -771,19 +539,19 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
   return hipSuccess;
 }

-void UpdateQueue(std::vector<std::vector<Node>>& parallelLists, amd::HostQueue*& queue,
+void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
                  hipGraphExec* ptr) {
   int i = 0;
   for (const auto& list : parallelLists) {
     // first parallel list will be launched on the same queue as parent
     if (i == 0) {
       for (auto& node : list) {
-        node->SetQueue(queue, ptr);
+        node->SetStream(stream, ptr);
       }
-    } else {  // New queue for parallel branches
-      amd::HostQueue* paralleQueue = ptr->GetAvailableQueue();
+    } else {  // New stream for parallel branches
+      hip::Stream* stream = ptr->GetAvailableStreams();
       for (auto& node : list) {
-        node->SetQueue(paralleQueue, ptr);
+        node->SetStream(stream, ptr);
       }
     }
     i++;
@@ -792,15 +560,22 @@ void UpdateQueue(std::vector<std::vector<Node>>& parallelLists, amd::HostQueue*&

 hipError_t hipGraphExec::Run(hipStream_t stream) {
   hipError_t status;
-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (queue == nullptr) {
+
+  if (hip::getStream(stream) == nullptr) {
     return hipErrorInvalidResourceHandle;
   }
-  UpdateQueue(parallelLists_, queue, this);
+  if (flags_ == hipGraphInstantiateFlagAutoFreeOnLaunch) {
+    if (!levelOrder_.empty()) {
+      levelOrder_[0]->GetParentGraph()->FreeAllMemory();
+    }
+  }
+  auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream()
+                                        : reinterpret_cast<hip::Stream*>(stream);
+  UpdateStream(parallelLists_, hip_stream, this);
   std::vector<amd::Command*> rootCommands;
   amd::Command* endCommand = nullptr;
   status =
-      FillCommands(parallelLists_, nodeWaitLists_, levelOrder_, rootCommands, endCommand, queue);
+      FillCommands(parallelLists_, nodeWaitLists_, levelOrder_, rootCommands, endCommand, hip_stream);
   if (status != hipSuccess) {
     return status;
   }
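Note: Run() now honors hipGraphInstantiateFlagAutoFreeOnLaunch, which hipGraphInstantiateWithFlags threads through ihipGraphInstantiate into hipGraphExec::flags_. A minimal opt-in sketch from user code; it assumes `graph` was built or captured elsewhere:

    #include <hip/hip_runtime.h>

    // Instantiate with the auto-free flag so memory backing the graph's
    // allocation nodes is released before each relaunch, matching the
    // FreeAllMemory() call added to hipGraphExec::Run above.
    hipError_t LaunchAutoFree(hipGraph_t graph, hipStream_t stream) {
      hipGraphExec_t exec = nullptr;
      hipError_t err = hipGraphInstantiateWithFlags(&exec, graph,
                                                    hipGraphInstantiateFlagAutoFreeOnLaunch);
      if (err != hipSuccess) return err;
      err = hipGraphLaunch(exec, stream);
      if (err == hipSuccess) err = hipStreamSynchronize(stream);
      (void)hipGraphExecDestroy(exec);
      return err;
    }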
diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp
index 6462535d..819125c1 100644
--- a/src/hip_graph_internal.hpp
+++ b/src/hip_graph_internal.hpp
@@ -38,9 +38,9 @@ typedef hipGraphNode* Node;
 hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
                         std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
                         std::vector<Node>& levelOrder, std::vector<amd::Command*>& rootCommands,
-                        amd::Command*& endCommand, amd::HostQueue* queue);
-void UpdateQueue(std::vector<std::vector<Node>>& parallelLists, amd::HostQueue*& queue,
-                 hipGraphExec* ptr);
+                        amd::Command*& endCommand, hip::Stream* stream);
+void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
+                  hipGraphExec* ptr);

 struct hipUserObject : public amd::ReferenceCountedObject {
   typedef void (*UserCallbackDestructor)(void* data);
@@ -76,8 +76,8 @@ struct hipUserObject : public amd::ReferenceCountedObject {
   }

   static bool isUserObjvalid(hipUserObject* pUsertObj) {
-    amd::ScopedLock lock(UserObjectLock_);
-    if (ObjectSet_.find(pUsertObj) == ObjectSet_.end()) {
+    auto it = ObjectSet_.find(pUsertObj);
+    if (it == ObjectSet_.end()) {
       return false;
     }
     return true;
@@ -85,8 +85,9 @@ struct hipUserObject : public amd::ReferenceCountedObject {

   static void removeUSerObj(hipUserObject* pUsertObj) {
     amd::ScopedLock lock(UserObjectLock_);
-    if (ObjectSet_.find(pUsertObj) == ObjectSet_.end()) {
-      ObjectSet_.erase(pUsertObj);
+    auto it = ObjectSet_.find(pUsertObj);
+    if (it != ObjectSet_.end()) {
+      ObjectSet_.erase(it);
     }
   }
@@ -154,7 +155,7 @@ struct hipGraphNodeDOTAttribute {

 struct hipGraphNode : public hipGraphNodeDOTAttribute {
  protected:
-  amd::HostQueue* queue_;
+  hip::Stream* stream_ = nullptr;
   uint32_t level_;
   unsigned int id_;
   hipGraphNodeType type_;
@@ -221,13 +222,15 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     return true;
   }

-  amd::HostQueue* GetQueue() { return queue_; }
+  hip::Stream* GetQueue() { return stream_; }

-  virtual void SetQueue(amd::HostQueue* queue, hipGraphExec* ptr = nullptr) { queue_ = queue; }
+  virtual void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) {
+    stream_ = stream;
+  }

   /// Create amd::command for the graph node
-  virtual hipError_t CreateCommand(amd::HostQueue* queue) {
+  virtual hipError_t CreateCommand(hip::Stream* stream) {
     commands_.clear();
-    queue_ = queue;
+    stream_ = stream;
     return hipSuccess;
   }
   /// Return node unique ID
@@ -265,6 +268,9 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     dependencies_.erase(std::remove(dependencies_.begin(), dependencies_.end(), node),
                         dependencies_.end());
   }
+  void RemoveEdge(const Node& childNode) {
+    edges_.erase(std::remove(edges_.begin(), edges_.end(), childNode), edges_.end());
+  }
   /// Return graph node children
   const std::vector<Node>& GetEdges() const { return edges_; }
   /// Updates graph node children
@@ -280,6 +286,12 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
       edge->UpdateEdgeLevel();
     }
   }
+  void ReduceEdgeLevel() {
+    for (auto edge : edges_) {
+      edge->SetLevel(std::min(edge->GetLevel(), GetLevel() + 1));
+      edge->ReduceEdgeLevel();
+    }
+  }
   /// Add edge, update parent node outdegree, child node indegree, level and dependency
   void AddEdge(const Node& childNode) {
     edges_.push_back(childNode);
@@ -290,7 +302,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     childNode->AddDependency(this);
   }
   /// Remove edge, update parent node outdegree, child node indegree, level and dependency
-  bool RemoveEdge(const Node& childNode) {
+  bool RemoveUpdateEdge(const Node& childNode) {
     // std::remove changes the end() hence saving it before hand for validation
     auto currEdgeEnd = edges_.end();
     auto it = std::remove(edges_.begin(), edges_.end(), childNode);
@@ -301,15 +313,20 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     edges_.erase(it, edges_.end());
     outDegree_--;
     childNode->SetInDegree(childNode->GetInDegree() - 1);
+    childNode->RemoveDependency(this);
     const std::vector<Node>& dependencies = childNode->GetDependencies();
     int32_t level = 0;
     int32_t parentLevel = 0;
+    uint32_t origLevel = 0;
     for (auto parent : dependencies) {
       parentLevel = parent->GetLevel();
       level = std::max(level, (parentLevel + 1));
     }
+    origLevel = childNode->GetLevel();
     childNode->SetLevel(level);
-    childNode->RemoveDependency(this);
+    if (level < origLevel) {
+      childNode->ReduceEdgeLevel();
+    }
     return true;
   }
   /// Get Runlist of the nodes embedded as part of the graphnode(e.g. ChildGraph)
@@ -323,7 +340,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
       command->updateEventWaitList(waitList);
     }
   }
-  virtual size_t GetNumParallelQueues() { return 0; }
+  virtual size_t GetNumParallelStreams() { return 0; }
   /// Enqueue commands part of the node
   virtual void EnqueueCommands(hipStream_t stream) {
     // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
@@ -332,8 +349,8 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
         (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy ||
          type_ == hipGraphNodeTypeMemset)) {
       amd::Command::EventWaitList waitList;
-      amd::HostQueue* queue = hip::getQueue(stream);
-      amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList);
+      hip::Stream* hip_stream = hip::getStream(stream);
+      amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList);
       command->enqueue();
       command->release();
       return;
@@ -365,7 +382,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
       fout << "\"" << fromNodeName << "\" -> \"" << toNodeName << "\"" << std::endl;
     }
   }
-  virtual std::string GetLabel() { return (std::to_string(id_) + "\n" + label_); }
+  virtual std::string GetLabel(hipGraphDebugDotFlags flag) { return (std::to_string(id_) + "\n" + label_); }
   unsigned int GetEnabled() const { return isEnabled_; }
   void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
 };
@@ -389,17 +406,9 @@ struct ihipGraph {
       , device_(device) {
     amd::ScopedLock lock(graphSetLock_);
     graphSet_.insert(this);
-    if (original == nullptr) {
-      // Create memory pool, associated with the graph
-      mem_pool_ = new hip::MemoryPool(device);
-      uint64_t max_size = std::numeric_limits<uint64_t>::max();
-      // Note: the call for the threshold is always successful
-      auto error = mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &max_size);
-    } else {
-      mem_pool_ = original->mem_pool_;
-      mem_pool_->retain();
-    }
-  };
+    mem_pool_ = device->GetGraphMemoryPool();
+    mem_pool_->retain();
+  }

   ~ihipGraph() {
     for (auto node : vertices_) {
@@ -414,7 +423,7 @@ struct ihipGraph {
       mem_pool_->release();
     }
-  };
+  }

   void AddManualNodeDuringCapture(hipGraphNode* node) { capturedNodes_.insert(node); }
@@ -478,7 +487,7 @@ struct ihipGraph {
   ihipGraph* clone() const;
   void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) {
     fout << "subgraph cluster_" << GetID() << " {" << std::endl;
-    fout << "graph[style=\"dashed\" label=\"graph_" << GetID() << "\"];\n";
+    fout << "label=\"graph_" << GetID() << "\"graph[style=\"dashed\"];\n";
     for (auto node : vertices_) {
       node->GenerateDOTNode(GetID(), fout, flag);
     }
@@ -516,6 +525,10 @@ struct ihipGraph {
     }
     return false;
   }
+
+  void FreeAllMemory() {
+    mem_pool_->FreeAllMemory();
+  }
 };

 struct hipGraphExec {
@@ -523,26 +536,28 @@
   std::vector<std::vector<Node>> parallelLists_;
   // level order of the graph doesn't include nodes embedded as part of the child graph
   std::vector<Node> levelOrder_;
   std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
-  std::vector<amd::HostQueue*> parallelQueues_;
+  std::vector<hip::Stream*> parallel_streams_;
   uint currentQueueIndex_;
   std::unordered_map<Node, Node> clonedNodes_;
   amd::Command* lastEnqueuedCommand_;
   static std::unordered_set<hipGraphExec*> graphExecSet_;
   std::unordered_set<hipUserObject*> graphExeUserObj_;
   static amd::Monitor graphExecSetLock_;
-
+  uint64_t flags_ = 0;
  public:
   hipGraphExec(std::vector<Node>& levelOrder, std::vector<std::vector<Node>>& lists,
                std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
                std::unordered_map<Node, Node>& clonedNodes,
-               std::unordered_set<hipUserObject*>& userObjs)
+               std::unordered_set<hipUserObject*>& userObjs,
+               uint64_t flags = 0)
       : parallelLists_(lists),
         levelOrder_(levelOrder),
         nodeWaitLists_(nodeWaitLists),
         clonedNodes_(clonedNodes),
         lastEnqueuedCommand_(nullptr),
         graphExeUserObj_(userObjs),
-        currentQueueIndex_(0) {
+        currentQueueIndex_(0),
+        flags_(flags) {
     amd::ScopedLock lock(graphExecSetLock_);
     graphExecSet_.insert(this);
   }
@@ -550,8 +565,10 @@ struct hipGraphExec {
   ~hipGraphExec() {
     // new commands are launched for every launch they are destroyed as and when command is
     // terminated after it complete execution
-    for (auto queue : parallelQueues_) {
-      queue->release();
+    for (auto stream : parallel_streams_) {
+      if (stream != nullptr) {
+        stream->release();
+      }
     }
     for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second;
     amd::ScopedLock lock(graphExecSetLock_);
@@ -576,10 +593,10 @@ struct hipGraphExec {

   std::vector<Node>& GetNodes() { return levelOrder_; }

-  amd::HostQueue* GetAvailableQueue() { return parallelQueues_[currentQueueIndex_++]; }
+  hip::Stream* GetAvailableStreams() { return parallel_streams_[currentQueueIndex_++]; }
   void ResetQueueIndex() { currentQueueIndex_ = 0; }
   hipError_t Init();
-  hipError_t CreateQueues(size_t numQueues);
+  hipError_t CreateStreams(uint32_t num_streams);
   hipError_t Run(hipStream_t stream);
 };
@@ -608,20 +625,20 @@ struct hipChildGraphNode : public hipGraphNode {

   ihipGraph* GetChildGraph() { return childGraph_; }

-  size_t GetNumParallelQueues() {
+  size_t GetNumParallelStreams() {
     LevelOrder(childGraphlevelOrder_);
     size_t num = 0;
     for (auto& node : childGraphlevelOrder_) {
-      num += node->GetNumParallelQueues();
+      num += node->GetNumParallelStreams();
     }
     // returns total number of parallel queues required for child graph nodes to be launched
     // first parallel list will be launched on the same queue as parent
     return num + (parallelLists_.size() - 1);
   }

-  void SetQueue(amd::HostQueue* queue, hipGraphExec* ptr = nullptr) {
-    queue_ = queue;
-    UpdateQueue(parallelLists_, queue, ptr);
+  void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) {
+    stream_ = stream;
+    UpdateStream(parallelLists_, stream, ptr);
   }

   // For nodes that are dependent on the child graph node waitlist is the last node of the first
@@ -629,8 +646,8 @@ struct hipChildGraphNode : public hipGraphNode {
   std::vector<amd::Command*>& GetCommands() { return parallelLists_[0].back()->GetCommands(); }

   // Create child graph node commands and set waitlists
-  hipError_t CreateCommand(amd::HostQueue* queue) {
-    hipError_t status = hipGraphNode::CreateCommand(queue);
+  hipError_t CreateCommand(hip::Stream* stream) {
+    hipError_t status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
@@ -638,7 +655,7 @@ struct hipChildGraphNode : public hipGraphNode {
     std::vector<amd::Command*> rootCommands;
     amd::Command* endCommand = nullptr;
     status = FillCommands(parallelLists_, nodeWaitLists_, childGraphlevelOrder_, rootCommands,
-                          endCommand, queue);
+                          endCommand, stream);
     for (auto& cmd : rootCommands) {
       commands_.push_back(cmd);
     }
@@ -710,12 +727,31 @@ class hipGraphKernelNode : public hipGraphNode {
   unsigned int kernelAttrInUse_;

  public:
+  void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) {
+    out << "[";
+    out << "style";
+    out << "=\"";
+    out << style_;
+    (flag == hipGraphDebugDotFlagsKernelNodeParams ||
+     flag == hipGraphDebugDotFlagsKernelNodeAttributes) ?
+        out << "\n" : out << "\"";
+    out << "shape";
+    out << "=\"";
+    out << GetShape(flag);
+    out << "\"";
+    out << "label";
+    out << "=\"";
+    out << GetLabel(flag);
+    out << "\"";
+    out << "];";
+  }
+
   std::string GetLabel(hipGraphDebugDotFlags flag) {
     hipFunction_t func = getFunc(*pKernelParams_, ihipGetDevice());
     hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
     std::string label;
-    if (flag == hipGraphDebugDotFlagsKernelNodeParams || flag == hipGraphDebugDotFlagsVerbose) {
-      char buffer[500];
+    char buffer[500];
+    if (flag == hipGraphDebugDotFlagsVerbose) {
       sprintf(buffer,
              "{\n%s\n| {ID | %d | %s\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>}\n| {{node "
              "handle | func handle} | {%p | %p}}\n| {accessPolicyWindow | {base_ptr | num_bytes | "
@@ -729,8 +765,29 @@ class hipGraphKernelNode : public hipGraphNode {
               kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
               kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative);
       label = buffer;
-    } else {
-      label = std::to_string(GetID()) + "\n" + function->name();
+    }
+    else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) {
+      sprintf(buffer,
+              "{\n%s\n| {ID | %d | %s}\n"
+              "| {accessPolicyWindow | {base_ptr | num_bytes | "
+              "hitRatio | hitProp | missProp} |\n| {%p | %ld | %f | %d | %d}}\n| {cooperative | "
+              "%u}\n| {priority | 0}\n}",
+              label_.c_str(), GetID(), function->name().c_str(),
+              kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes,
+              kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
+              kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative);
+      label = buffer;
+    }
+    else if (flag == hipGraphDebugDotFlagsKernelNodeParams) {
+      sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>",
+              GetID(), function->name().c_str(), pKernelParams_->gridDim.x,
+              pKernelParams_->gridDim.y, pKernelParams_->gridDim.z,
+              pKernelParams_->blockDim.x, pKernelParams_->blockDim.y,
+              pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes);
+      label = buffer;
+    }
+    else {
+      label = std::to_string(GetID()) + "\n" + function->name() + "\n";
     }
     return label;
   }
@@ -868,14 +925,14 @@ class hipGraphKernelNode : public hipGraphNode {
     return new hipGraphKernelNode(static_cast<const hipGraphKernelNode&>(*this));
   }

-  hipError_t CreateCommand(amd::HostQueue* queue) {
+  hipError_t CreateCommand(hip::Stream* stream) {
     hipFunction_t func = nullptr;
     hipError_t status = validateKernelParams(pKernelParams_, &func,
-                                             queue ? hip::getDeviceID(queue->context()) : -1);
+                                             stream ? hip::getDeviceID(stream->context()) : -1);
     if (hipSuccess != status) {
       return status;
     }
-    status = hipGraphNode::CreateCommand(queue);
+    status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
@@ -886,7 +943,7 @@ class hipGraphKernelNode : public hipGraphNode {
         pKernelParams_->gridDim.y * pKernelParams_->blockDim.y,
         pKernelParams_->gridDim.z * pKernelParams_->blockDim.z, pKernelParams_->blockDim.x,
         pKernelParams_->blockDim.y, pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes,
-        queue, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0, 0,
+        stream, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0, 0,
         0, 0);
     commands_.emplace_back(command);
     return status;
@@ -942,7 +999,7 @@ class hipGraphKernelNode : public hipGraphNode {
   }
   hipError_t GetAttrParams(hipKernelNodeAttrID attr, hipKernelNodeAttrValue* params) {
     // Get kernel attr params
-    if (kernelAttrInUse_ != attr) return hipErrorInvalidValue;
+    if (kernelAttrInUse_ != 0 && kernelAttrInUse_ != attr) return hipErrorInvalidValue;
     if (attr == hipKernelNodeAttributeAccessPolicyWindow) {
       params->accessPolicyWindow.base_ptr = kernelAttr_.accessPolicyWindow.base_ptr;
       params->accessPolicyWindow.hitProp = kernelAttr_.accessPolicyWindow.hitProp;
@@ -979,22 +1036,6 @@ class hipGraphKernelNode : public hipGraphNode {
     }
     return hipSuccess;
   }
-  // ToDo: use this when commands are cloned and command params are to be updated
-  hipError_t SetCommandParams(const hipKernelNodeParams* params) {
-    // updates kernel params
-    hipError_t status = validateKernelParams(params);
-    if (hipSuccess != status) {
-      return status;
-    }
-    size_t globalWorkOffset[3] = {0};
-    size_t globalWorkSize[3] = {params->gridDim.x, params->gridDim.y, params->gridDim.z};
-    size_t localWorkSize[3] = {params->blockDim.x, params->blockDim.y, params->blockDim.z};
-    reinterpret_cast<amd::NDRangeKernelCommand*>(commands_[0])
-        ->setSizes(globalWorkOffset, globalWorkSize, localWorkSize);
-    reinterpret_cast<amd::NDRangeKernelCommand*>(commands_[0])
-        ->setSharedMemBytes(params->sharedMemBytes);
-    return hipSuccess;
-  }

   hipError_t SetParams(hipGraphNode* node) {
     const hipGraphKernelNode* kernelNode = static_cast<const hipGraphKernelNode*>(node);
@@ -1045,17 +1086,17 @@ class hipGraphMemcpyNode : public hipGraphNode {
     return new hipGraphMemcpyNode(static_cast<const hipGraphMemcpyNode&>(*this));
   }

-  hipError_t CreateCommand(amd::HostQueue* queue) {
+  hipError_t CreateCommand(hip::Stream* stream) {
     if (IsHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->kind)) {
       return hipSuccess;
     }
-    hipError_t status = hipGraphNode::CreateCommand(queue);
+    hipError_t status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
     commands_.reserve(1);
     amd::Command* command;
-    status = ihipMemcpy3DCommand(command, pCopyParams_, queue);
+    status = ihipMemcpy3DCommand(command, pCopyParams_, stream);
     commands_.emplace_back(command);
     return status;
   }
@@ -1064,7 +1105,7 @@ class hipGraphMemcpyNode : public hipGraphNode {
     if (isEnabled_ &&
        IsHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->kind)) {
       ihipHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr,
                      pCopyParams_->extent.width * pCopyParams_->extent.height *
-                     pCopyParams_->extent.depth, *hip::getQueue(stream));
+                     pCopyParams_->extent.depth, *hip::getStream(stream));
       return;
     }
     hipGraphNode::EnqueueCommands(stream);
@@ -1085,8 +1126,6 @@ class hipGraphMemcpyNode : public hipGraphNode {
     const hipGraphMemcpyNode* memcpyNode = static_cast<const hipGraphMemcpyNode*>(node);
     return SetParams(memcpyNode->pCopyParams_);
   }
-  // ToDo: use this when commands are cloned and command params are to be updated
-  hipError_t SetCommandParams(const hipMemcpy3DParms* pNodeParams);
   hipError_t ValidateParams(const hipMemcpy3DParms* pNodeParams);
   std::string GetLabel(hipGraphDebugDotFlags flag) {
     const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pCopyParams_);
@@ -1191,17 +1230,17 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     return new hipGraphMemcpyNode1D(static_cast<const hipGraphMemcpyNode1D&>(*this));
   }

-  virtual hipError_t CreateCommand(amd::HostQueue* queue) {
+  virtual hipError_t CreateCommand(hip::Stream* stream) {
     if (IsHtoHMemcpy(dst_, src_, kind_)) {
       return hipSuccess;
     }
-    hipError_t status = hipGraphNode::CreateCommand(queue);
+    hipError_t status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
     commands_.reserve(1);
     amd::Command* command = nullptr;
-    status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *queue);
+    status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream);
     commands_.emplace_back(command);
     return status;
   }
@@ -1216,14 +1255,14 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     if (isEnabled_) {
       //HtoH
       if (isH2H) {
-        ihipHtoHMemcpy(dst_, src_, count_, *hip::getQueue(stream));
+        ihipHtoHMemcpy(dst_, src_, count_, *hip::getStream(stream));
         return;
       }
       amd::Command* command = commands_[0];
       amd::HostQueue* cmdQueue = command->queue();
-      amd::HostQueue* queue = hip::getQueue(stream);
+      hip::Stream* hip_stream = hip::getStream(stream);

-      if (cmdQueue == queue) {
+      if (cmdQueue == hip_stream) {
        command->enqueue();
        command->release();
        return;
      }
@@ -1231,7 +1270,7 @@ class hipGraphMemcpyNode1D : public hipGraphNode {

       amd::Command::EventWaitList waitList;
       amd::Command* depdentMarker = nullptr;
-      amd::Command* cmd = queue->getLastQueuedCommand(true);
+      amd::Command* cmd = hip_stream->getLastQueuedCommand(true);
       if (cmd != nullptr) {
         waitList.push_back(cmd);
         amd::Command* depdentMarker = new amd::Marker(*cmdQueue, true, waitList);
@@ -1248,7 +1287,7 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
       if (cmd != nullptr) {
         waitList.clear();
         waitList.push_back(cmd);
-        amd::Command* depdentMarker = new amd::Marker(*queue, true, waitList);
+        amd::Command* depdentMarker = new amd::Marker(*hip_stream, true, waitList);
         if (depdentMarker != nullptr) {
           depdentMarker->enqueue();  // Make sure future commands of queue synced with command
           depdentMarker->release();
@@ -1257,8 +1296,8 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
       }
     } else {
       amd::Command::EventWaitList waitList;
-      amd::HostQueue* queue = hip::getQueue(stream);
-      amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList);
+      hip::Stream* hip_stream = hip::getStream(stream);
+      amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList);
       command->enqueue();
       command->release();
     }
@@ -1281,8 +1320,6 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     return SetParams(memcpy1DNode->dst_, memcpy1DNode->src_, memcpy1DNode->count_,
                      memcpy1DNode->kind_);
   }
-  // ToDo: use this when commands are cloned and command params are to be updated
-  hipError_t SetCommandParams(void* dst, const void* src, size_t count, hipMemcpyKind kind);
   static hipError_t ValidateParams(void* dst, const void* src, size_t count, hipMemcpyKind kind);
   std::string GetLabel(hipGraphDebugDotFlags flag) {
     size_t sOffsetOrig = 0;
@@ -1306,9 +1343,15 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     if (flag == hipGraphDebugDotFlagsMemcpyNodeParams || flag ==
hipGraphDebugDotFlagsVerbose) { char buffer[500]; sprintf(buffer, - "{\n%s\n| {{ID | node handle | dst | src | count | kind } | {%u | %p | %p | %p | " - "%zu | %s}}}", - label_.c_str(), GetID(), this, dst_, src_, count_, memcpyDirection.c_str()); + "{\n%s\n| {{ID | node handle} | {%u | %p}}\n| {kind | %s}\n| {{srcPtr | dstPtr} | " + "{pitch " + "| ptr | xsize | ysize | pitch | ptr | xsize | size} | {%zu | %p | %zu | %zu | %zu | %p " + "| %zu " + "| %zu}}\n| {{srcPos | {{x | %zu} | {y | %zu} | {z | %zu}}} | {dstPos | {{x | %zu} | {y " + "| " + "%zu} | {z | %zu}}} | {Extent | {{Width | %zu} | {Height | %zu} | {Depth | %zu}}}}\n}", + label_.c_str(), GetID(), this, memcpyDirection.c_str(), (size_t)0, + src_, (size_t)0, (size_t)0, (size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, count_, (size_t)1, (size_t)1); label = buffer; } else { label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," + @@ -1343,8 +1386,8 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -1357,7 +1400,7 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *queue); + status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -1403,18 +1446,6 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { return SetParams(memcpyNode->dst_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_, memcpyNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(void* dst, const void* symbol, size_t count, size_t offset, - hipMemcpyKind kind) { - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; - - hipError_t status = ihipMemcpySymbol_validate(symbol, count, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - return hipGraphMemcpyNode1D::SetCommandParams(dst, device_ptr, count, kind); - } }; class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { const void* symbol_; @@ -1433,8 +1464,8 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { return new hipGraphMemcpyNodeToSymbol(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -1447,7 +1478,7 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *queue); + status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -1491,18 +1522,6 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { return SetParams(memcpyNode->src_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_, memcpyNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t 
SetCommandParams(const void* symbol, const void* src, size_t count, size_t offset, - hipMemcpyKind kind) { - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; - - hipError_t status = ihipMemcpySymbol_validate(symbol, count, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - return hipGraphMemcpyNode1D::SetCommandParams(device_ptr, src, count, kind); - } }; class hipGraphMemsetNode : public hipGraphNode { @@ -1562,21 +1581,21 @@ class hipGraphMemsetNode : public hipGraphNode { } } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } if (pMemsetParams_->height == 1) { size_t sizeBytes = pMemsetParams_->width * pMemsetParams_->elementSize; hipError_t status = ihipMemsetCommand(commands_, pMemsetParams_->dst, pMemsetParams_->value, - pMemsetParams_->elementSize, sizeBytes, queue); + pMemsetParams_->elementSize, sizeBytes, stream); } else { hipError_t status = ihipMemset3DCommand( commands_, {pMemsetParams_->dst, pMemsetParams_->pitch, pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height}, - pMemsetParams_->value, {pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height, 1}, queue, pMemsetParams_->elementSize); + pMemsetParams_->value, {pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height, 1}, stream, pMemsetParams_->elementSize); } return status; } @@ -1635,15 +1654,15 @@ class hipGraphEventRecordNode : public hipGraphNode { return new hipGraphEventRecordNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } hip::Event* e = reinterpret_cast(event_); commands_.reserve(1); amd::Command* command = nullptr; - status = e->recordCommand(command, queue); + status = e->recordCommand(command, stream); commands_.emplace_back(command); return status; } @@ -1673,16 +1692,6 @@ class hipGraphEventRecordNode : public hipGraphNode { static_cast(node); return SetParams(eventRecordNode->event_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(hipEvent_t event) { - amd::HostQueue* queue; - if (!commands_.empty()) { - queue = commands_[0]->queue(); - commands_[0]->release(); - } - commands_.clear(); - return CreateCommand(queue); - } }; class hipGraphEventWaitNode : public hipGraphNode { @@ -1698,15 +1707,15 @@ class hipGraphEventWaitNode : public hipGraphNode { return new hipGraphEventWaitNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } hip::Event* e = reinterpret_cast(event_); commands_.reserve(1); amd::Command* command; - status = e->streamWaitCommand(command, queue); + status = e->streamWaitCommand(command, stream); commands_.emplace_back(command); return status; } @@ -1735,16 +1744,6 @@ class hipGraphEventWaitNode : public hipGraphNode { const hipGraphEventWaitNode* eventWaitNode = static_cast(node); return SetParams(eventWaitNode->event_); } - // ToDo: use 
this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(hipEvent_t event) { - amd::HostQueue* queue; - if (!commands_.empty()) { - queue = commands_[0]->queue(); - commands_[0]->release(); - } - commands_.clear(); - return CreateCommand(queue); - } }; class hipGraphHostNode : public hipGraphNode { @@ -1765,14 +1764,14 @@ class hipGraphHostNode : public hipGraphNode { return new hipGraphHostNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } amd::Command::EventWaitList waitList; commands_.reserve(1); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); commands_.emplace_back(command); return hipSuccess; } @@ -1814,8 +1813,6 @@ class hipGraphHostNode : public hipGraphNode { const hipGraphHostNode* hostNode = static_cast(node); return SetParams(hostNode->pNodeParams_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(const hipHostNodeParams* params); }; class hipGraphEmptyNode : public hipGraphNode { @@ -1827,14 +1824,14 @@ class hipGraphEmptyNode : public hipGraphNode { return new hipGraphEmptyNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } amd::Command::EventWaitList waitList; commands_.reserve(1); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); commands_.emplace_back(command); return hipSuccess; } @@ -1854,11 +1851,9 @@ class hipGraphMemAllocNode : public hipGraphNode { return new hipGraphMemAllocNode(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { - auto error = hipGraphNode::CreateCommand(queue); - // Note: memory pool can work with hip::Streams only. It can't accept amd::HostQueue. - // Resource tracking is disabled! - auto ptr = Execute(); + virtual hipError_t CreateCommand(hip::Stream* stream) { + auto error = hipGraphNode::CreateCommand(stream); + auto ptr = Execute(stream_); return error; } @@ -1897,15 +1892,13 @@ class hipGraphMemFreeNode : public hipGraphNode { return new hipGraphMemFreeNode(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { - auto error = hipGraphNode::CreateCommand(queue); - // Note: memory pool can work with hip::Streams only. It can't accept amd::HostQueue. - // Resource tracking is disabled! 
- Execute(); + virtual hipError_t CreateCommand(hip::Stream* stream) { + auto error = hipGraphNode::CreateCommand(stream); + Execute(stream_); return error; } - void Execute(hip::Stream* stream = nullptr) { + void Execute(hip::Stream* stream) { auto graph = GetParentGraph(); if (graph != nullptr) { graph->FreeMemory(device_ptr_, stream); diff --git a/src/hip_hcc.def.in b/src/hip_hcc.def.in index 187c0fdd..fe219359 100644 --- a/src/hip_hcc.def.in +++ b/src/hip_hcc.def.in @@ -103,6 +103,9 @@ hipMemPoolImportPointer hipArrayCreate hipArray3DCreate hipArrayDestroy +hipArrayGetInfo +hipArrayGetDescriptor +hipArray3DGetDescriptor hipMallocArray hipMemAdvise hipMemAllocPitch @@ -191,6 +194,7 @@ hipStreamCreate hipStreamCreateWithFlags hipStreamCreateWithPriority hipStreamDestroy +hipStreamGetDevice hipStreamGetFlags hipStreamQuery hipStreamSynchronize diff --git a/src/hip_hcc.map.in b/src/hip_hcc.map.in index a9adf372..204b139f 100644 --- a/src/hip_hcc.map.in +++ b/src/hip_hcc.map.in @@ -169,6 +169,7 @@ global: hipStreamCreateWithFlags; hipStreamCreateWithPriority; hipStreamDestroy; + hipStreamGetDevice; hipStreamGetFlags; hipStreamQuery; hipStreamSynchronize; @@ -517,3 +518,12 @@ global: local: *; } hip_5.3; + +hip_5.6 { +global: + hipArrayGetInfo; + hipArrayGetDescriptor; + hipArray3DGetDescriptor; +local: + *; +} hip_5.5; \ No newline at end of file diff --git a/src/hip_hmm.cpp b/src/hip_hmm.cpp index 00687f84..ec201663 100644 --- a/src/hip_hmm.cpp +++ b/src/hip_hmm.cpp @@ -94,7 +94,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, HIP_RETURN(hipErrorInvalidDevice); } - amd::HostQueue* queue = nullptr; + hip::Stream* hip_stream = nullptr; amd::Device* dev = nullptr; bool cpu_access = false; @@ -106,19 +106,19 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, // Pick the specified stream or Null one from the provided device if (device == hipCpuDeviceId) { cpu_access = true; - queue = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : hip::getQueue(stream); + hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : hip::getStream(stream); } else { dev = g_devices[device]->devices()[0]; - queue = (stream == nullptr) ? g_devices[device]->NullStream() : hip::getQueue(stream); + hip_stream = (stream == nullptr) ? 
g_devices[device]->NullStream() : hip::getStream(stream); } - if (queue == nullptr) { + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } amd::Command::EventWaitList waitList; amd::SvmPrefetchAsyncCommand* command = - new amd::SvmPrefetchAsyncCommand(*queue, waitList, dev_ptr, count, dev, cpu_access); + new amd::SvmPrefetchAsyncCommand(*hip_stream, waitList, dev_ptr, count, dev, cpu_access); if (command == nullptr) { return hipErrorOutOfMemory; } @@ -233,8 +233,8 @@ hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align) { return hipSuccess; } - assert((hip::host_device->asContext()!= nullptr) && "Current host context must be valid"); - amd::Context& ctx = *hip::host_device->asContext(); + assert((hip::host_context != nullptr) && "Current host context must be valid"); + amd::Context& ctx = *hip::host_context; const amd::Device& dev = *ctx.devices()[0]; diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 99bdf00d..ca924bd5 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -225,12 +225,11 @@ class stream_per_thread { namespace hip { class Device; class MemoryPool; - class Stream { + class Stream : public amd::HostQueue { public: enum Priority : int { High = -1, Normal = 0, Low = 1 }; private: - amd::HostQueue* queue_; mutable amd::Monitor lock_; Device* device_; Priority priority_; @@ -260,18 +259,20 @@ namespace hip { /// Capture events std::unordered_set captureEvents_; unsigned long long captureID_; + + static inline CommandQueue::Priority convertToQueuePriority(Priority p){ + return p == Priority::High ? amd::CommandQueue::Priority::High : p == Priority::Low ? + amd::CommandQueue::Priority::Low : amd::CommandQueue::Priority::Normal; + } + public: Stream(Device* dev, Priority p = Priority::Normal, unsigned int f = 0, bool null_stream = false, const std::vector& cuMask = {}, hipStreamCaptureStatus captureStatus = hipStreamCaptureStatusNone); - ~Stream(); + /// Creates the hip stream object, including AMD host queue bool Create(); - - /// Get device AMD host queue object. 
The method can allocate the queue - amd::HostQueue* asHostQueue(bool skip_alloc = false); - - void Finish() const; + virtual bool terminate() override; /// Get device ID associated with the current stream; int DeviceId() const; /// Get HIP device associated with the stream @@ -378,18 +379,23 @@ namespace hip { parallelCaptureStreams_.erase(it); } } + static bool existsActiveStreamForDevice(hip::Device* device); + + /// The stream should be destroyed via release() rather than delete + private: + ~Stream() {}; }; /// HIP Device class class Device { - amd::Monitor lock_{"Device lock"}; + amd::Monitor lock_{"Device lock", true}; /// ROCclr context amd::Context* context_; /// Device's ID /// Store it here so we don't have to loop through the device list every time int deviceId_; /// ROCclr host queue for default streams - Stream null_stream_; + Stream* null_stream_ = nullptr; /// Store device flags unsigned int flags_; /// Maintain list of user enabled peers @@ -398,21 +404,21 @@ namespace hip { /// True if this device is active bool isActive_; - std::vector queues_; - MemoryPool* default_mem_pool_; + MemoryPool* default_mem_pool_; //!< Default memory pool for this device MemoryPool* current_mem_pool_; + MemoryPool* graph_mem_pool_; //!< Memory pool, associated with graphs for this device std::set mem_pools_; public: Device(amd::Context* ctx, int devId): context_(ctx), deviceId_(devId), - null_stream_(this, Stream::Priority::Normal, 0, true), - flags_(hipDeviceScheduleSpin), + flags_(hipDeviceScheduleSpin), isActive_(false), default_mem_pool_(nullptr), - current_mem_pool_(nullptr) + current_mem_pool_(nullptr), + graph_mem_pool_(nullptr) { assert(ctx != nullptr); } ~Device(); @@ -445,22 +451,16 @@ namespace hip { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - amd::HostQueue* NullStream(bool skip_alloc = false); - Stream* GetNullStream(); + hip::Stream* NullStream(bool skip_alloc = false); + Stream* GetNullStream(); - void SaveQueue(amd::HostQueue* queue) { - amd::ScopedLock lock(lock_); - queues_.push_back(queue); - } bool GetActiveStatus() { amd::ScopedLock lock(lock_); if (isActive_) return true; - for (int i = 0; i < queues_.size(); i++) { - if (queues_[i]->GetQueueStatus()) { - isActive_ = true; - return true; - } + if (Stream::existsActiveStreamForDevice(this)) { + isActive_ = true; + return true; } return false; } @@ -476,6 +476,9 @@ namespace hip { /// Get the default memory pool on the device MemoryPool* GetDefaultMemoryPool() const { return default_mem_pool_; } + /// Get the graph memory pool on the device + MemoryPool* GetGraphMemoryPool() const { return graph_mem_pool_; } + /// Add memory pool to the device void AddMemoryPool(MemoryPool* pool); @@ -490,6 +493,7 @@ namespace hip { /// Removes a destroyed stream from the safe list of memory pools void RemoveStreamFromPools(Stream* stream); + }; /// Thread Local Storage Variables Aggregator Class @@ -513,7 +517,7 @@ namespace hip { extern thread_local TlsAggregator tls; /// Device representing the host - for pinned memory - extern Device* host_device; + extern amd::Context* host_context; extern bool init(); @@ -524,11 +528,11 @@ namespace hip { /// Get ROCclr queue associated with hipStream /// Note: This follows the CUDA spec to sync with default streams /// and Blocking streams - extern amd::HostQueue* getQueue(hipStream_t stream); + extern hip::Stream* getStream(hipStream_t stream); /// Get default stream associated with the ROCclr context - extern amd::HostQueue* getNullStream(amd::Context&); + extern 
hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern amd::HostQueue* getNullStream(); + extern hip::Stream* getNullStream(); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid @@ -542,7 +546,7 @@ extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* /// Wait all active streams on the blocking queue. The method enqueues a wait command and /// doesn't stall the current thread -extern void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream = false); +extern void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); extern std::vector g_devices; extern hipError_t ihipDeviceGetCount(int* count); @@ -561,7 +565,8 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t de extern hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId); extern hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr, uint64_t value, uint64_t mask, unsigned int flags, size_t sizeBytes); - +hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, + hip::Stream& stream, bool isHostAsync = false, bool isGPUAsync = true); constexpr bool kOptionChangeable = true; constexpr bool kNewDevProg = false; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index f3fcf68f..79295437 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -78,9 +78,9 @@ hipError_t ihipFree(void *ptr) { auto dev = g_devices[device_id]; // Skip stream allocation, since if it wasn't allocated until free, then the device wasn't used constexpr bool SkipStreamAlloc = true; - amd::HostQueue* queue = dev->NullStream(SkipStreamAlloc); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(SkipStreamAlloc); + if (stream != nullptr) { + stream->finish(); } hip::Stream::syncNonBlockingStreams(device_id); // Find out if memory belongs to any memory pool @@ -195,15 +195,15 @@ hipError_t hipSignalExternalSemaphoresAsync( if (extSemArray == nullptr || paramsArray == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } for (unsigned int i = 0; i < numExtSems; i++) { if (extSemArray[i] != nullptr) { amd::ExternalSemaphoreCmd* command = - new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + new amd::ExternalSemaphoreCmd(*hip_stream, extSemArray[i], paramsArray[i].params.fence.value, amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE); if (command == nullptr) { return hipErrorOutOfMemory; @@ -227,15 +227,15 @@ hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemAr if (extSemArray == nullptr || paramsArray == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } for (unsigned int i = 0; i < numExtSems; i++) { if (extSemArray[i] != nullptr) { amd::ExternalSemaphoreCmd* command = - new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + new amd::ExternalSemaphoreCmd(*hip_stream, extSemArray[i], paramsArray[i].params.fence.value, amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE); if (command == 
nullptr) { return hipErrorOutOfMemory; @@ -274,7 +274,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) bool useHostDevice = (flags & CL_MEM_SVM_FINE_GRAIN_BUFFER) != 0; amd::Context* curDevContext = hip::getCurrentDevice()->asContext(); - amd::Context* amdContext = useHostDevice ? hip::host_device->asContext() : curDevContext; + amd::Context* amdContext = useHostDevice ? hip::host_context : curDevContext; if (amdContext == nullptr) { return hipErrorOutOfMemory; @@ -343,35 +343,35 @@ hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, } hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, amd::HostQueue& queue, bool isAsync) { + hipMemcpyKind kind, hip::Stream& stream, bool isAsync) { amd::Command::EventWaitList waitList; size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - amd::Device* queueDevice = &queue.device(); + amd::Device* queueDevice = &stream.device(); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if (queueDevice != dstMemory->getContext().devices()[0]) { - pQueue = hip::getNullStream(dstMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::WriteMemoryCommand(*pQueue, CL_COMMAND_WRITE_BUFFER, waitList, + command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList, *dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if (queueDevice != srcMemory->getContext().devices()[0]) { - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::ReadMemoryCommand(*pQueue, CL_COMMAND_READ_BUFFER, waitList, + command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList, *srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { // Check if the queue device doesn't match the device on any memory object. 
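The two branches above handle copies where only one side is backed by a device allocation: when the issuing stream's device does not own that allocation, the command is rebuilt on the null stream of the owning device and ordered against the issuing stream through its last queued command. A minimal caller-level sketch of the behavior this preserves (hypothetical example, not part of the patch; assumes at least two visible devices and omits error checking):

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  int ndev = 0;
  hipGetDeviceCount(&ndev);
  if (ndev < 2) { printf("needs two devices\n"); return 0; }

  hipSetDevice(1);                       // destination lives on device 1
  void* dst = nullptr;
  hipMalloc(&dst, 1 << 20);

  hipSetDevice(0);                       // but the copy is issued on a device-0 stream
  hipStream_t s0;
  hipStreamCreate(&s0);

  std::vector<char> src(1 << 20, 0x5A);  // pageable host source
  hipMemcpyAsync(dst, src.data(), src.size(), hipMemcpyHostToDevice, s0);
  hipStreamSynchronize(s0);

  hipStreamDestroy(s0);
  hipFree(dst);
  return 0;
}

Because the source here is pageable host memory, the srcMemory == nullptr branch applies and ihipMemcpy (see the hunks that follow) downgrades the transfer to host-synchronous, so src is safe to reuse as soon as hipMemcpyAsync returns.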
@@ -380,7 +380,7 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && ((srcMemory->getContext().devices().size() == 1) && (dstMemory->getContext().devices().size() == 1))) { - command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, + command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); if (command == nullptr) { return hipErrorOutOfMemory; @@ -392,12 +392,12 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, return hipErrorInvalidValue; } } else { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && (queueDevice != srcMemory->getContext().devices()[0])) { copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::NONE; - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } @@ -405,22 +405,22 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, // Scenarios such as DtoH where dst is pinned memory if ((queueDevice != srcMemory->getContext().devices()[0]) && (dstMemory->getContext().devices().size() != 1)) { - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } // Scenarios such as HtoD where src is pinned memory } else if ((queueDevice != dstMemory->getContext().devices()[0]) && (srcMemory->getContext().devices().size() != 1)) { - pQueue = hip::getNullStream(dstMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } } - command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, + command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes, copyMetadata); } @@ -445,13 +445,13 @@ bool IsHtoHMemcpy(void* dst, const void* src, hipMemcpyKind kind) { } return false; } -void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, amd::HostQueue& queue) { - queue.finish(); +void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream) { + stream.finish(); memcpy(dst, src, sizeBytes); } // ================================================================================================ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false) { + hip::Stream& stream, bool isHostAsync, bool isGPUAsync) { hipError_t status; if (sizeBytes == 0) { // Skip if nothing needs writing. 
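For the remaining case, where both pointers map to device allocations on two different single-device contexts, the builder above emits amd::CopyMemoryP2PCommand instead of a staged copy. A hedged caller-level sketch that exercises this path (hypothetical example, not part of the patch; peer-access support varies by platform, so the example degrades to a no-op when it is absent):

#include <hip/hip_runtime.h>

int main() {
  int can = 0;
  hipDeviceCanAccessPeer(&can, 0, 1);
  if (!can) return 0;                  // single-device or non-P2P system

  hipSetDevice(0);
  hipDeviceEnablePeerAccess(1, 0);     // let device 0 reach device 1
  void* src = nullptr;
  hipMalloc(&src, 4096);

  hipSetDevice(1);
  void* dst = nullptr;
  hipMalloc(&dst, 4096);

  // A single call; the runtime is expected to select the peer-to-peer
  // command path shown in the hunk above.
  hipMemcpyPeer(dst, 1, src, 0, 4096);

  hipFree(dst);
  hipSetDevice(0);
  hipFree(src);
  return 0;
}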
@@ -469,29 +469,39 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); if (srcMemory == nullptr && dstMemory == nullptr) { - ihipHtoHMemcpy(dst, src, sizeBytes, queue); + ihipHtoHMemcpy(dst, src, sizeBytes, stream); return hipSuccess; } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - isAsync = false; + isHostAsync = false; } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - isAsync = false; + isHostAsync = false; } + amd::Command* command = nullptr; - status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, queue, isAsync); + status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync); if (status != hipSuccess) { return status; } command->enqueue(); - if (!isAsync) { + if (!isHostAsync) { command->awaitCompletion(); + } else if (!isGPUAsync) { + hip::Stream* pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command::EventWaitList waitList; + waitList.push_back(command); + amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList); + if (depdentMarker != nullptr) { + depdentMarker->enqueue(); + depdentMarker->release(); + } } else { amd::HostQueue* newQueue = command->queue(); - if (newQueue != &queue) { + if (newQueue != &stream) { amd::Command::EventWaitList waitList; amd::Command* cmd = newQueue->getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); - amd::Command* depdentMarker = new amd::Marker(queue, true, waitList); + amd::Command* depdentMarker = new amd::Marker(stream, true, waitList); if (depdentMarker != nullptr) { depdentMarker->enqueue(); depdentMarker->release(); @@ -512,7 +522,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag if (flags == hipDeviceMallocDefault) { ihipFlags = 0; } else if (flags == hipDeviceMallocFinegrained) { - ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN; + ihipFlags = CL_MEM_SVM_ATOMICS; + } else if (flags == hipDeviceMallocUncached) { + ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED; } else if (flags == hipMallocSignalMemory) { ihipFlags = CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER | ROCCLR_MEM_HSA_SIGNAL_MEMORY; if (sizeBytes != 8) { @@ -594,18 +606,18 @@ hipError_t hipFree(void* ptr) { hipError_t hipMemcpy_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream = nullptr) { CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = nullptr; + hip::Stream* hip_stream = nullptr; if (stream != nullptr) { - queue = hip::getQueue(stream); + hip_stream = hip::getStream(stream); } else { - queue = hip::getNullStream(); + hip_stream = hip::getNullStream(); } - if (queue == nullptr) { + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - return ihipMemcpy(dst, src, sizeBytes, kind, *queue); + return ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream); } hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -626,12 +638,12 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, HIP_RETURN(hipErrorContextIsDestroyed); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream, false)); } hipError_t 
hipMemPtrGetInfo(void *ptr, size_t *size) { @@ -680,9 +692,9 @@ hipError_t ihipArrayDestroy(hipArray* array) { } for (auto& dev : g_devices) { - amd::HostQueue* queue = dev->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(true); + if (stream != nullptr) { + stream->finish(); } } @@ -1030,6 +1042,7 @@ hipError_t ihipArrayCreate(hipArray** array, (*array)->depth = pAllocateArray->Depth; (*array)->Format = pAllocateArray->Format; (*array)->NumChannels = pAllocateArray->NumChannels; + (*array)->flags = pAllocateArray->Flags; { amd::ScopedLock lock(hip::hipArraySetLock); hip::hipArraySet.insert(*array); @@ -1138,7 +1151,7 @@ hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) if (hostPtr == nullptr || sizeBytes == 0 || flags > 15) { return hipErrorInvalidValue; } else { - amd::Memory* mem = new (*hip::host_device->asContext()) amd::Buffer(*hip::host_device->asContext(), + amd::Memory* mem = new (*hip::host_context) amd::Buffer(*hip::host_context, CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS, sizeBytes); constexpr bool sysMemAlloc = false; @@ -1187,9 +1200,9 @@ hipError_t ihipHostUnregister(void* hostPtr) { // Wait on the device, associated with the current memory object during allocation auto device_id = mem->getUserData().deviceId; - amd::HostQueue* queue = g_devices[device_id]->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = g_devices[device_id]->NullStream(true); + if (stream != nullptr) { + stream->finish(); } amd::MemObjMap::RemoveMemObj(hostPtr); @@ -1374,11 +1387,11 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *stream)); } hipError_t hipMemcpyDtoH(void* dstHost, @@ -1386,11 +1399,11 @@ hipError_t hipMemcpyDtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *stream)); } hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, @@ -1398,22 +1411,22 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *stream)); } hipError_t hipMemcpyAsync_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyAsync, stream, dst, src, 
sizeBytes, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - return ihipMemcpy(dst, src, sizeBytes, kind, *queue, true); + return ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream, true); } hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, @@ -1434,12 +1447,12 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, void* srcHost, size_t By HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream); hipMemcpyKind kind = hipMemcpyHostToDevice; STREAM_CAPTURE(hipMemcpyHtoDAsync, stream, dstDevice, srcHost, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstDevice, srcHost, ByteCount, kind, *queue, true)); + ihipMemcpy(dstDevice, srcHost, ByteCount, kind, *hip_stream, true)); } hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, @@ -1447,12 +1460,12 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream); hipMemcpyKind kind = hipMemcpyDeviceToDevice; STREAM_CAPTURE(hipMemcpyDtoDAsync, stream, dstDevice, srcDevice, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstDevice, srcDevice, ByteCount, kind, *queue, true)); + ihipMemcpy(dstDevice, srcDevice, ByteCount, kind, *hip_stream, true)); } hipError_t hipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, @@ -1460,12 +1473,12 @@ hipError_t hipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t By HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, srcDevice, ByteCount, stream); hipMemcpyKind kind = hipMemcpyDeviceToHost; STREAM_CAPTURE(hipMemcpyDtoHAsync, stream, dstHost, srcDevice, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstHost, srcDevice, ByteCount, kind, *queue, true)); + ihipMemcpy(dstHost, srcDevice, ByteCount, kind, *hip_stream, true)); } hipError_t ihipMemcpyAtoDValidate(hipArray* srcArray, void* dstDevice, amd::Coord3D& srcOrigin, @@ -1514,7 +1527,7 @@ hipError_t ihipMemcpyAtoDValidate(hipArray* srcArray, void* dstDevice, amd::Coor hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { amd::BufferRect srcRect; amd::BufferRect dstRect; amd::Memory* dstMemory; @@ -1526,7 +1539,7 @@ hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, voi return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE_TO_BUFFER, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, *srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); 
@@ -1588,7 +1601,7 @@ hipError_t ihipMemcpyDtoAValidate(void* srcDevice, hipArray* dstArray, amd::Coor hipError_t ihipMemcpyDtoACommand(amd::Command*& command, void* srcDevice, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { amd::Image* dstImage; amd::Memory* srcMemory; amd::BufferRect dstRect; @@ -1599,7 +1612,7 @@ hipError_t ihipMemcpyDtoACommand(amd::Command*& command, void* srcDevice, hipArr if (status != hipSuccess) { return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_BUFFER_TO_IMAGE, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, *srcMemory, *dstImage, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); @@ -1661,7 +1674,7 @@ hipError_t ihipMemcpyDtoDValidate(void* srcDevice, void* dstDevice, amd::Coord3D hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue) { + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream) { amd::Memory* srcMemory; amd::Memory* dstMemory; amd::BufferRect srcRect; @@ -1676,7 +1689,7 @@ hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand( - *queue, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect); if (copyCommand == nullptr) { @@ -1726,7 +1739,7 @@ hipError_t ihipMemcpyDtoHValidate(void* srcDevice, void* dstHost, amd::Coord3D& hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue, + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream, bool isAsync = false) { amd::Memory* srcMemory; amd::BufferRect srcRect; @@ -1740,7 +1753,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::ReadMemoryCommand* readCommand = - new amd::ReadMemoryCommand(*queue, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, + new amd::ReadMemoryCommand(*stream, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect, copyMetadata); @@ -1791,7 +1804,7 @@ hipError_t ihipMemcpyHtoDValidate(const void* srcHost, void* dstDevice, amd::Coo hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue, + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream, bool isAsync = false) { amd::Memory* dstMemory; amd::BufferRect srcRect; @@ -1806,7 +1819,7 @@ hipError_t 
ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( - *queue, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, + *stream, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, copyRegion, srcHost, dstRect, srcRect, copyMetadata); if (writeCommand == nullptr) { @@ -1824,7 +1837,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { if ((srcHost == nullptr) || (dstHost == nullptr)) { return hipErrorInvalidValue; } @@ -1841,8 +1854,8 @@ hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOr return hipErrorInvalidValue; } - if (queue) { - queue->finish(); + if (stream) { + stream->finish(); } for (size_t slice = 0; slice < copyRegion[2]; slice++) { @@ -1891,7 +1904,7 @@ hipError_t ihipMemcpyAtoAValidate(hipArray* srcArray, hipArray* dstArray, amd::C hipError_t ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, - amd::Coord3D copyRegion, amd::HostQueue* queue) { + amd::Coord3D copyRegion, hip::Stream* stream) { amd::Image* srcImage; amd::Image* dstImage; @@ -1901,7 +1914,7 @@ hipError_t ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hip return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_IMAGE, amd::Command::EventWaitList{}, *srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion); @@ -1950,7 +1963,7 @@ hipError_t ihipMemcpyHtoAValidate(const void* srcHost, hipArray* dstArray, hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - amd::HostQueue* queue, bool isAsync = false) { + hip::Stream* stream, bool isAsync = false) { amd::Image* dstImage; amd::BufferRect srcRect; @@ -1962,7 +1975,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( - *queue, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, + *stream, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, copyRegion, static_cast(srcHost) + srcRect.start_, srcRowPitch, srcSlicePitch, copyMetadata); @@ -2011,7 +2024,7 @@ hipError_t ihipMemcpyAtoHValidate(hipArray* srcArray, void* dstHost, amd::Coord3 hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue, bool isAsync = false) { + hip::Stream* stream, bool isAsync = false) { amd::Image* srcImage; amd::BufferRect dstRect; amd::CopyMetadata copyMetadata(isAsync, 
amd::CopyMetadata::CopyEnginePreference::SDMA); @@ -2023,7 +2036,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, voi } amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( - *queue, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, + *stream, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, copyRegion, static_cast(dstHost) + dstRect.start_, dstRowPitch, dstSlicePitch, copyMetadata); @@ -2040,7 +2053,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, voi } hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3D* pCopy, - amd::HostQueue* queue) { + hip::Stream* stream) { // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the // (unified virtual address space) base address of the source data and the bytes per row to apply. // {src/dst}Array is ignored. @@ -2088,41 +2101,41 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 // Host to Device. return ihipMemcpyHtoDCommand(command, pCopy->srcHost, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) { // Device to Host. return ihipMemcpyDtoHCommand(command, pCopy->srcDevice, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) { // Device to Device. return ihipMemcpyDtoDCommand(command, pCopy->srcDevice, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) { // Host to Image. return ihipMemcpyHtoACommand(command, pCopy->srcHost, pCopy->dstArray, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) { // Image to Host. return ihipMemcpyAtoHCommand(command, pCopy->srcArray, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) { // Device to Image. return ihipMemcpyDtoACommand(command, pCopy->srcDevice, pCopy->dstArray, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) { // Image to Device. return ihipMemcpyAtoDCommand(command, pCopy->srcArray, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) { // Image to Image. 
return ihipMemcpyAtoACommand(command, pCopy->srcArray, pCopy->dstArray, srcOrigin, dstOrigin, - copyRegion, queue); + copyRegion, stream); } else { ShouldNotReachHere(); } @@ -2194,14 +2207,14 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool // Host to Host. return ihipMemcpyHtoH(pCopy->srcHost, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, pCopy->dstPitch, - pCopy->dstPitch * pCopy->dstHeight, hip::getQueue(stream)); + pCopy->dstPitch * pCopy->dstHeight, hip::getStream(stream)); } else { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - status = ihipGetMemcpyParam3DCommand(command, pCopy, queue); + status = ihipGetMemcpyParam3DCommand(command, pCopy, hip_stream); if (status != hipSuccess) return status; // Transfers from device memory to pageable host memory and transfers from any host memory to any host memory @@ -2489,13 +2502,13 @@ hipError_t ihipMemcpyAtoD(hipArray* srcArray, void* dstDevice, amd::Coord3D srcO amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoDCommand(command, srcArray, dstDevice, srcOrigin, dstOrigin, copyRegion, - dstRowPitch, dstSlicePitch, queue); + dstRowPitch, dstSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2503,13 +2516,13 @@ hipError_t ihipMemcpyDtoA(void* srcDevice, hipArray* dstArray, amd::Coord3D srcO amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoACommand(command, srcDevice, dstArray, srcOrigin, dstOrigin, copyRegion, - srcRowPitch, srcSlicePitch, queue); + srcRowPitch, srcSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2518,13 +2531,13 @@ hipError_t ihipMemcpyDtoD(void* srcDevice, void* dstDevice, amd::Coord3D srcOrig size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoDCommand(command, srcDevice, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue); + dstSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2533,13 +2546,13 @@ hipError_t ihipMemcpyDtoH(void* srcDevice, void* dstHost, amd::Coord3D srcOrigin size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue 
== nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoHCommand(command, srcDevice, dstHost, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue, isAsync); + dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2548,13 +2561,13 @@ hipError_t ihipMemcpyHtoD(const void* srcHost, void* dstDevice, amd::Coord3D src size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyHtoDCommand(command, srcHost, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue, isAsync); + dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2562,12 +2575,12 @@ hipError_t ihipMemcpyAtoA(hipArray* srcArray, hipArray* dstArray, amd::Coord3D s amd::Coord3D dstOrigin, amd::Coord3D copyRegion, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoACommand(command, srcArray, dstArray, srcOrigin, dstOrigin, - copyRegion, queue); + copyRegion, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2575,13 +2588,13 @@ hipError_t ihipMemcpyHtoA(const void* srcHost, hipArray* dstArray, amd::Coord3D amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyHtoACommand(command, srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, - srcRowPitch, srcSlicePitch, queue, isAsync); + srcRowPitch, srcSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2589,13 +2602,13 @@ hipError_t ihipMemcpyAtoH(hipArray* srcArray, void* dstHost, amd::Coord3D srcOri amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoHCommand(command, srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, - dstRowPitch, dstSlicePitch, queue, isAsync); + dstRowPitch, dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2655,9 +2668,9 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p) { } hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p, - amd::HostQueue* queue) { + hip::Stream* stream) { const HIP_MEMCPY3D desc = 
hip::getDrvMemcpy3DDesc(*p); - return ihipGetMemcpyParam3DCommand(command, &desc, queue); + return ihipGetMemcpyParam3DCommand(command, &desc, stream); } hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream, bool isAsync = false) { @@ -2715,8 +2728,8 @@ hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) { hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, size_t offset, int64_t value, size_t valueSize, size_t sizeBytes, - amd::HostQueue* queue) { - if ((memory == nullptr) || (queue == nullptr)) { + hip::Stream* stream) { + if ((memory == nullptr) || (stream == nullptr)) { return hipErrorInvalidValue; } @@ -2726,7 +2739,7 @@ hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, si // surface=[pitch, width, height] amd::Coord3D surface(sizeBytes, sizeBytes, 1); amd::FillMemoryCommand* fillMemCommand = - new amd::FillMemoryCommand(*queue, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(), + new amd::FillMemoryCommand(*stream, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(), &value, valueSize, fillOffset, fillSize, surface); if (fillMemCommand == nullptr) { return hipErrorOutOfMemory; @@ -2792,7 +2805,7 @@ hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams) { } hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, int64_t value, - size_t valueSize, size_t sizeBytes, amd::HostQueue* queue) { + size_t valueSize, size_t sizeBytes, hip::Stream* stream) { hipError_t hip_error = hipSuccess; auto aligned_dst = amd::alignUp(reinterpret_cast
(dst), sizeof(uint64_t)); size_t offset = 0; @@ -2802,7 +2815,7 @@ hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, in amd::Command* command; hip_error = packFillMemoryCommand(command, memory, offset, value, valueSize, sizeBytes, - queue); + stream); commands.push_back(command); return hip_error; @@ -2836,8 +2849,8 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt } } std::vector<amd::Command*> commands; - amd::HostQueue* queue = hip::getQueue(stream); - hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, queue); + hip::Stream* hip_stream = hip::getStream(stream); + hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, hip_stream); if (hip_error != hipSuccess) { break; } @@ -2954,13 +2967,13 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte } hipError_t ihipMemset3DCommand(std::vector<amd::Command*> &commands, hipPitchedPtr pitchedDevPtr, - int value, hipExtent extent, amd::HostQueue* queue, size_t elementSize = 1) { + int value, hipExtent extent, hip::Stream* stream, size_t elementSize = 1) { size_t offset = 0; auto sizeBytes = extent.width * extent.height * extent.depth; amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); if (pitchedDevPtr.pitch == extent.width) { return ihipMemsetCommand(commands, pitchedDevPtr.ptr, value, elementSize, - static_cast<size_t>(sizeBytes), queue); + static_cast<size_t>(sizeBytes), stream); } // Workaround for cases when pitch > row until fill kernel will be updated to support pitch. // Fall back to filling one row at a time. @@ -2976,7 +2989,7 @@ hipError_t ihipMemset3DCommand(std::vector<amd::Command*> &commands, hipPitchedP } amd::FillMemoryCommand* command; command = new amd::FillMemoryCommand( - *queue, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), + *stream, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), &value, elementSize, origin, region, surface); commands.push_back(command); return hipSuccess; @@ -3007,9 +3020,9 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent isAsync = true; } } - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); std::vector<amd::Command*> commands; - status = ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, queue); + status = ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, hip_stream); if (status != hipSuccess) { return status; } @@ -3524,22 +3537,105 @@ hipError_t hipArrayDestroy(hipArray* array) { HIP_RETURN(ihipArrayDestroy(array)); } -hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, - hipArray* array) { - HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); +hipError_t ihipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* desc, + hipArray* array) { + { + amd::ScopedLock lock(hip::hipArraySetLock); + if (hip::hipArraySet.find(array) == hip::hipArraySet.end()) { + return hipErrorInvalidHandle; + } + } - assert(false && "Unimplemented"); + desc->Width = array->width; + desc->Height = array->height; + desc->Depth = array->depth; + desc->Format = array->Format; + desc->NumChannels = array->NumChannels; + desc->Flags = array->flags; - HIP_RETURN(hipSuccess); + return hipSuccess; +} + +hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, + hipExtent* extent, + unsigned int* flags, + hipArray* array) { + HIP_INIT_API(hipArrayGetInfo, desc, extent, flags, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + // If 
all output parameters are nullptr, then no need to proceed further + if ((desc == nullptr) && (extent == nullptr) && (flags == nullptr)) { + HIP_RETURN(hipSuccess); + } + + HIP_ARRAY3D_DESCRIPTOR array3DDescriptor; + hipError_t status = ihipArray3DGetDescriptor(&array3DDescriptor, array); + + // Fill each output parameter + if (status == hipSuccess) { + if (desc != nullptr) { + *desc = hip::getChannelFormatDesc(array3DDescriptor.NumChannels, array3DDescriptor.Format); + } + + if (extent != nullptr) { + extent->width = array3DDescriptor.Width; + extent->height = array3DDescriptor.Height; + extent->depth = array3DDescriptor.Depth; + } + + if (flags != nullptr) { + *flags = array3DDescriptor.Flags; + } + } + + HIP_RETURN(status); } hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, hipArray* array) { HIP_INIT_API(hipArrayGetDescriptor, pArrayDescriptor, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } - assert(false && "Unimplemented"); + if (pArrayDescriptor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } - HIP_RETURN(hipSuccess); + HIP_ARRAY3D_DESCRIPTOR array3DDescriptor; + hipError_t status = ihipArray3DGetDescriptor(&array3DDescriptor, array); + + // Fill each output parameter + if (status == hipSuccess) { + pArrayDescriptor->Width = array3DDescriptor.Width; + pArrayDescriptor->Height = array3DDescriptor.Height; + pArrayDescriptor->Format = array3DDescriptor.Format; + pArrayDescriptor->NumChannels = array3DDescriptor.NumChannels; + } + + HIP_RETURN(status); +} + +hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + if (pArrayDescriptor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(ihipArray3DGetDescriptor(pArrayDescriptor, array)); } hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, @@ -3845,9 +3941,9 @@ hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { } for (auto& dev : g_devices) { - amd::HostQueue* queue = dev->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(true); + if (stream != nullptr) { + stream->finish(); } } diff --git a/src/hip_mempool.cpp b/src/hip_mempool.cpp index eea254f5..f798f8c8 100644 --- a/src/hip_mempool.cpp +++ b/src/hip_mempool.cpp @@ -70,7 +70,7 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) { if ((dev_ptr == nullptr) || (size == 0) || (!hip::isValid(stream))) { HIP_RETURN(hipErrorInvalidValue); } - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream); auto device = hip_stream->GetDevice(); auto mem_pool = device->GetCurrentMemoryPool(); @@ -92,7 +92,7 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) { auto memory = getMemoryObject(dev_ptr, offset); if (memory != nullptr) { auto id = memory->getUserData().deviceId; - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream); if (!g_devices[id]->FreeMemory(memory, hip_stream)) { //! @todo It's not the most optimal logic. 
The current implementation has unconditional waits @@ -241,7 +241,7 @@ hipError_t hipMallocFromPoolAsync( STREAM_CAPTURE(hipMallocAsync, stream, mem_pool, size, dev_ptr); auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool); - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream); *dev_ptr = mpool->AllocateMemory(size, hip_stream); HIP_RETURN(hipSuccess); diff --git a/src/hip_mempool_impl.cpp b/src/hip_mempool_impl.cpp index ddec8f49..2606688f 100644 --- a/src/hip_mempool_impl.cpp +++ b/src/hip_mempool_impl.cpp @@ -397,4 +397,10 @@ void MemoryPool::GetAccess(hip::Device* device, hipMemAccessFlags* flags) { } } +void MemoryPool::FreeAllMemory(hip::Stream* stream) { + while (!busy_heap_.Allocations().empty()) { + FreeMemory(busy_heap_.Allocations().begin()->first, stream); + } +} + } diff --git a/src/hip_mempool_impl.hpp b/src/hip_mempool_impl.hpp index e42bc7eb..5e18cb35 100644 --- a/src/hip_mempool_impl.hpp +++ b/src/hip_mempool_impl.hpp @@ -136,7 +136,7 @@ class Heap : public amd::EmbeddedObject { bool IsActiveMemory(amd::Memory* memory) const { return (allocations_.find(memory) != allocations_.end()); } - + const auto& Allocations() { return allocations_; } private: Heap() = delete; Heap(const Heap&) = delete; @@ -213,6 +213,9 @@ class MemoryPool : public amd::ReferenceCountedObject { /// Set memory pool access by different devices void GetAccess(hip::Device* device, hipMemAccessFlags* flags); + /// Frees all busy memory + void FreeAllMemory(hip::Stream* stream = nullptr); + /// Accessors for the pool state bool EventDependencies() const { return (state_.event_dependencies_) ? true : false; } bool Opportunistic() const { return (state_.opportunistic_) ? true : false; }
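The new MemoryPool::FreeAllMemory() above drains whatever remains in the busy heap, which device teardown needs now that streams own their queues. From the application side, the pool machinery these mempool hunks refactor is driven by the public stream-ordered allocation API; a minimal usage sketch (hypothetical buffer size, error checks omitted):

hipStream_t stream;
hipStreamCreate(&stream);
void* buf = nullptr;
// Allocation is served from the device's current memory pool, in stream order.
hipMallocAsync(&buf, 1 << 20, stream);
// ... launch kernels that use buf on the same stream ...
// Returns the block to the pool once prior work on the stream completes.
hipFreeAsync(buf, stream);
hipStreamSynchronize(stream);
hipStreamDestroy(stream);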
@@ -223,7 +226,6 @@ class MemoryPool : public amd::ReferenceCountedObject { MemoryPool(const MemoryPool&) = delete; MemoryPool& operator=(const MemoryPool&) = delete; - Heap busy_heap_; //!< Heap of busy allocations Heap free_heap_; //!< Heap of freed allocations struct { diff --git a/src/hip_module.cpp b/src/hip_module.cpp index f3ae2611..a3fa4919 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -305,7 +305,7 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, - amd::HostQueue* queue, void** kernelParams, void** extra, + hip::Stream* stream, void** kernelParams, void** extra, hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, @@ -328,7 +328,7 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, } amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand( - *queue, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, + *stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange); if (!kernelCommand) { return hipErrorOutOfMemory; @@ -336,10 +336,12 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, // Capture the kernel arguments if (CL_SUCCESS != kernelCommand->captureAndValidate()) { - delete kernelCommand; + kernelCommand->release(); return hipErrorOutOfMemory; } + command = kernelCommand; + return hipSuccess; } @@ -369,9 +371,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, return status; } amd::Command* command = nullptr; - amd::HostQueue* queue = hip::getQueue(hStream); + hip::Stream* hip_stream = hip::getStream(hStream); status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, - blockDimX, blockDimY, blockDimZ, sharedMemBytes, queue, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream, kernelParams, extra, startEvent, stopEvent, flags, params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); if (status != hipSuccess) { @@ -386,12 +388,25 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, } } - command->enqueue(); - if (stopEvent != nullptr) { hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent); + if (eStop->flags & hipEventDisableSystemFence) { + command->setEventScope(amd::Device::kCacheStateIgnore); + } else { + command->setEventScope(amd::Device::kCacheStateSystem); + } + // Enqueue Dispatch and bind the stop event + command->enqueue(); eStop->BindCommand(*command, false); + } else { + command->enqueue(); + } + + if (command->status() == CL_INVALID_OPERATION) { + command->release(); + return hipErrorIllegalState; } + command->release(); return hipSuccess; @@ -516,7 +531,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* return hipErrorInvalidValue; } - if ((flags < 0) || (flags > hipCooperativeLaunchMultiDeviceNoPostSync)) { + if (flags > (hipCooperativeLaunchMultiDeviceNoPostSync + + hipCooperativeLaunchMultiDeviceNoPreSync)) { return hipErrorInvalidValue; }
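With the relaxed check above, the two multi-device sync-skipping flags can now be combined; the old code rejected anything beyond hipCooperativeLaunchMultiDeviceNoPostSync alone. A hedged sketch, assuming launchParamsList and numDevices have already been populated with one entry (and one stream) per participating device:

// Skip both the pre-launch and post-launch synchronization across devices.
unsigned int flags = hipCooperativeLaunchMultiDeviceNoPreSync |
                     hipCooperativeLaunchMultiDeviceNoPostSync;
hipError_t err =
    hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
// Any bits beyond the sum of the two flags now yield hipErrorInvalidValue.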
@@ -535,8 +551,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* } if (launch.hStream != nullptr) { // Validate devices to make sure it doesn't have duplicates - amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.hStream)->asHostQueue(); - auto device = &queue->vdev()->device(); + hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launch.hStream); + auto device = &hip_stream->vdev()->device(); for (int j = 0; j < numDevices; ++j) { if (mgpu_list[j] == device) { return hipErrorInvalidDevice; } @@ -553,23 +569,23 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { for (int i = 0; i < numDevices; ++i) { - amd::HostQueue* queue = - reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream)->asHostQueue(); - queue->finish(); + hip::Stream* hip_stream = + reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream); + hip_stream->finish(); } } for (int i = 0; i < numDevices; ++i) { const hipFunctionLaunchParams& launch = launchParamsList[i]; - amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.hStream)->asHostQueue(); + hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launch.hStream); if (i == 0) { // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { // Find the matching device - if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { // Save ROCclr index of the first device in the launch - firstDevice = queue->vdev()->device().index(); + if (&hip_stream->vdev()->device() == g_devices[dev]->devices()[0]) { + firstDevice = hip_stream->vdev()->device().index(); break; } } @@ -599,9 +615,9 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { for (int i = 0; i < numDevices; ++i) { - amd::HostQueue* queue = - reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream)->asHostQueue(); - queue->finish(); + hip::Stream* hip_stream = + reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream); + hip_stream->finish(); } } @@ -730,12 +746,12 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL return hipErrorInvalidValue; } - amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.stream)->asHostQueue(); + hip::Stream* hip_stream = hip::getStream(launch.stream); hipFunction_t func = nullptr; // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { // Find the matching device and request the kernel function - if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { + if (&hip_stream->vdev()->device() == g_devices[dev]->devices()[0]) { IHIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, launch.func, dev)); break; } } diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index fb6b3329..17dc65da 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -220,7 +220,8 @@ hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevic HIP_RETURN(hipErrorInvalidDevice); } - HIP_RETURN(hipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice)); + HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip::getNullStream(), + true, false)); } hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice, @@ -232,8 +233,14 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src srcDevice < 0 || dstDevice < 0) { HIP_RETURN(hipErrorInvalidDevice); } - - HIP_RETURN(hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream)); + if (!hip::isValid(stream)) { + return 
hipErrorContextIsDestroyed; + } + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { + return hipErrorInvalidValue; + } + HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip_stream, true, true)); } hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) { diff --git a/src/hip_platform.cpp b/src/hip_platform.cpp index aebd7f99..10b8d3f8 100644 --- a/src/hip_platform.cpp +++ b/src/hip_platform.cpp @@ -33,8 +33,6 @@ PlatformState* PlatformState::platform_; // Initialized as nullptr by default // forward declaration of methods required for __hipRegisterManagedVar hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); struct __CudaFatBinaryWrapper { unsigned int magic; @@ -146,9 +144,9 @@ extern "C" void __hipRegisterManagedVar( HIP_INIT_VOID(); hipError_t status = ihipMallocManaged(pointer, size, align); if (status == hipSuccess) { - amd::HostQueue* queue = hip::getNullStream(); - if (queue != nullptr) { - status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *queue); + hip::Stream* stream = hip::getNullStream(); + if (stream != nullptr) { + status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *stream); guarantee((status == hipSuccess), "Error during memcpy to managed memory!"); } else { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL"); @@ -662,7 +660,10 @@ void PlatformState::init() { initialized_ = true; for (auto& it : statCO_.modules_) { hipError_t err = digestFatBinary(it.first, it.second); - assert(err == hipSuccess); + if (err != hipSuccess) { + HIP_ERROR_PRINT(err); + return; + } } for (auto& it : statCO_.vars_) { it.second->resize_dVar(g_devices.size()); } @@ -673,8 +674,6 @@ } hipError_t PlatformState::loadModule(hipModule_t* module, const char* fname, const void* image) { - amd::ScopedLock lock(lock_); - if (module == nullptr) { return hipErrorInvalidValue; } @@ -689,6 +688,7 @@ *module = dynCo->module(); assert(*module != nullptr); + amd::ScopedLock lock(lock_); if (dynCO_map_.find(*module) != dynCO_map_.end()) { delete dynCo; return hipErrorAlreadyMapped; diff --git a/src/hip_runtime.cpp b/src/hip_runtime.cpp index 4c877806..78eb4aca 100644 --- a/src/hip_runtime.cpp +++ b/src/hip_runtime.cpp @@ -46,9 +46,14 @@ extern "C" BOOL WINAPI DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) { } #endif // DEBUG break; - case DLL_PROCESS_DETACH: + case DLL_PROCESS_DETACH: { + amd::Thread* thread = amd::Thread::current(); + if (!(thread != nullptr || + ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) { + return true; + } ihipDestroyDevice(); - break; + } break; case DLL_THREAD_DETACH: { amd::Thread* thread = amd::Thread::current(); delete thread; diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 3d1e9168..6d085fe7 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -31,7 +31,8 @@ namespace hip { // ================================================================================================ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, const std::vector<uint32_t>& cuMask, hipStreamCaptureStatus captureStatus) - : queue_(nullptr), + : amd::HostQueue(*dev->asContext(), *dev->devices()[0], 0, amd::CommandQueue::RealTimeDisabled, +
convertToQueuePriority(p), cuMask), lock_("Stream Callback lock"), device_(dev), priority_(p), @@ -40,22 +41,11 @@ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, cuMask_(cuMask), captureStatus_(captureStatus), originStream_(false), - captureID_(0) {} - -// ================================================================================================ -Stream::~Stream() { - if (queue_ != nullptr) { - amd::ScopedLock lock(streamSetLock); - streamSet.erase(this); - - // Skip queue destruction for null stream in MT. Queue worker thread can be destroyed on - // the app exit, during the stream destruction, causing a race condition. - if (!null_ || AMD_DIRECT_DISPATCH) { - queue_->release(); - queue_ = nullptr; - } - } -} + captureID_(0) + { + amd::ScopedLock lock(streamSetLock); + streamSet.insert(this); + } // ================================================================================================ hipError_t Stream::EndCapture() { @@ -81,38 +71,16 @@ hipError_t Stream::EndCapture() { // ================================================================================================ bool Stream::Create() { - amd::CommandQueue::Priority p; - switch (priority_) { - case Priority::High: - p = amd::CommandQueue::Priority::High; - break; - case Priority::Low: - p = amd::CommandQueue::Priority::Low; - break; - case Priority::Normal: - default: - p = amd::CommandQueue::Priority::Normal; - break; - } - amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], - 0, amd::CommandQueue::RealTimeDisabled, - p, cuMask_); - - // Create a host queue - bool result = (queue != nullptr) ? queue->create() : false; - // Insert just created stream into the list of the blocking queues - if (result) { + return create(); +} + +// ================================================================================================ +bool Stream::terminate() { + { amd::ScopedLock lock(streamSetLock); - streamSet.insert(this); - queue_ = queue; - device_->SaveQueue(queue); - } else if (queue != nullptr) { - // Queue creation has failed, and virtual device associated with the queue may not be created. - // Just need to delete the queue instance. 
- delete queue; } - - return result; + return HostQueue::terminate(); } // ================================================================================================ @@ -134,29 +102,6 @@ bool isValid(hipStream_t& stream) { return true; } -// ================================================================================================ -amd::HostQueue* Stream::asHostQueue(bool skip_alloc) { - if (queue_ != nullptr) { - return queue_; - } - // Access to the stream object is lock protected, because possible allocation - amd::ScopedLock l(Lock()); - if (queue_ == nullptr) { - // Create the host queue for the first time - if (!skip_alloc) { - Create(); - } - } - return queue_; -} - -// ================================================================================================ -void Stream::Finish() const { - if (queue_ != nullptr) { - queue_->finish(); - } -} - // ================================================================================================ int Stream::DeviceId() const { return device_->deviceId(); @@ -180,7 +125,7 @@ void Stream::syncNonBlockingStreams(int deviceId) { for (auto& it : streamSet) { if (it->Flags() & hipStreamNonBlocking) { if (it->DeviceId() == deviceId) { - it->asHostQueue()->finish(); + it->finish(); } } } @@ -207,7 +152,7 @@ void Stream::destroyAllStreams(int deviceId) { } } for (auto& it : toBeDeleted) { - delete it; + it->release(); } } @@ -215,36 +160,48 @@ bool Stream::StreamCaptureOngoing(void) { return (g_allCapturingStreams.empty() == true) ? false : true; } +bool Stream::existsActiveStreamForDevice(hip::Device* device) { + + amd::ScopedLock lock(streamSetLock); + + for (const auto& active_stream : streamSet) { + if ((active_stream->GetDevice() == device) && + active_stream->GetQueueStatus()) { + return true; + } + } + return false; +} + };// hip namespace // ================================================================================================ -void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream) { +void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { amd::Command::EventWaitList eventWaitList(0); bool submitMarker = 0; { amd::ScopedLock lock(streamSetLock); - for (const auto& stream : streamSet) { - amd::HostQueue* active_queue = stream->asHostQueue(); + for (const auto& active_stream : streamSet) { // If it's the current device - if ((&active_queue->device() == &blocking_queue->device()) && + if ((&active_stream->device() == &blocking_stream->device()) && // Make sure it's a default stream - ((stream->Flags() & hipStreamNonBlocking) == 0) && + ((active_stream->Flags() & hipStreamNonBlocking) == 0) && // and it's not the current stream - (active_queue != blocking_queue) && + (active_stream != blocking_stream) && // check for a wait on the null stream - (stream->Null() == wait_null_stream)) { + (active_stream->Null() == wait_null_stream)) { // Get the last valid command - amd::Command* command = active_queue->getLastQueuedCommand(true); + amd::Command* command = active_stream->getLastQueuedCommand(true); if (command != nullptr) { amd::Event& event = command->event(); // Check HW status of the ROCclr event. 
// Note: not all ROCclr modes support HW status - bool ready = active_queue->device().IsHwEventReady(event); + bool ready = active_stream->device().IsHwEventReady(event); if (!ready) { ready = (command->status() == CL_COMPLETE); } - submitMarker |= active_queue->vdev()->isFenceDirty(); + submitMarker |= active_stream->vdev()->isFenceDirty(); // Check the current active status if (!ready) { command->notifyCmdQueue(); @@ -263,11 +220,19 @@ void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream // Check if we have to wait anything if (eventWaitList.size() > 0 || submitMarker) { - amd::Command* command = new amd::Marker(*blocking_queue, kMarkerDisableFlush, eventWaitList); + amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); if (command != nullptr) { command->enqueue(); command->release(); } + + //Reset the dirty flag for all streams now that the marker is submitted + for (const auto& stream : streamSet) { + amd::HostQueue* active_queue = stream->asHostQueue(); + if (active_queue->vdev()->isFenceDirty()) { + active_queue->vdev()->resetFenceDirty(); + } + } } // Release all active commands. It's safe after the marker was enqueued @@ -292,8 +257,11 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, } hip::Stream* hStream = new hip::Stream(hip::getCurrentDevice(), priority, flags, false, cuMask); - if (hStream == nullptr || !hStream->Create()) { - delete hStream; + if (hStream == nullptr) { + return hipErrorOutOfMemory; + } + else if (!hStream->Create()) { + hStream->release(); return hipErrorOutOfMemory; } @@ -314,7 +282,7 @@ stream_per_thread::stream_per_thread() { stream_per_thread::~stream_per_thread() { for (auto &stream:m_streams) { if (stream != nullptr && hip::isValid(stream)) { - delete reinterpret_cast<hip::Stream*>(stream); + reinterpret_cast<hip::Stream*>(stream)->release(); stream = nullptr; } } @@ -453,7 +421,7 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { } } // Wait for the current host queue - hip::getQueue(stream)->finish(); + hip::getStream(stream)->finish(); return hipSuccess; } @@ -502,7 +470,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) { if (l_it != hip::tls.capture_streams_.end()) { hip::tls.capture_streams_.erase(l_it); } - delete s; + s->release(); HIP_RETURN(hipSuccess); } @@ -534,12 +502,6 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig return hipErrorContextIsDestroyed; } - if (stream != nullptr) { - // If still capturing return error - if (hip::Stream::StreamCaptureOngoing() == true) { - HIP_RETURN(hipErrorStreamCaptureIsolation); - } - } hip::Event* e = reinterpret_cast<hip::Event*>(event); return e->streamWait(stream, flags); } @@ -568,9 +530,9 @@ hipError_t hipStreamQuery_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } - amd::HostQueue* hostQueue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); - amd::Command* command = hostQueue->getLastQueuedCommand(true); + amd::Command* command = hip_stream->getLastQueuedCommand(true); if (command == nullptr) { // Nothing was submitted to the queue return hipSuccess; @@ -608,13 +570,13 @@ hipError_t streamCallback_common(hipStream_t stream, StreamCallback* cbo, void* return hipErrorContextIsDestroyed; } - amd::HostQueue* hostQueue = hip::getQueue(stream); - amd::Command* last_command = hostQueue->getLastQueuedCommand(true); + hip::Stream* hip_stream = hip::getStream(stream); + amd::Command* last_command = hip_stream->getLastQueuedCommand(true); amd::Command::EventWaitList 
eventWaitList; if (last_command != nullptr) { eventWaitList.push_back(last_command); } - amd::Command* command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList); + amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, eventWaitList); if (command == nullptr) { return hipErrorInvalidValue; } @@ -634,7 +596,7 @@ hipError_t streamCallback_common(hipStream_t stream, StreamCallback* cbo, void* // Add the new barrier to stall the stream, until the callback is done eventWaitList.clear(); eventWaitList.push_back(command); - amd::Command* block_command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList); + amd::Command* block_command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, eventWaitList); if (block_command == nullptr) { return hipErrorInvalidValue; } @@ -695,7 +657,7 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD // ================================================================================================ hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData) { HIP_INIT_API(hipLaunchHostFunc, stream, fn, userData); - if (stream == nullptr) { + if (stream == nullptr && (hip::Stream::StreamCaptureOngoing() == true)) { HIP_RETURN(hipErrorStreamCaptureImplicit); } HIP_RETURN(hipLaunchHostFunc_common(stream, fn, userData)); @@ -833,3 +795,27 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32 } HIP_RETURN(hipSuccess); } + +// ================================================================================================ +hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) { + HIP_INIT_API(hipStreamGetDevice, stream, device); + + if (device == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (!hip::isValid(stream)) { + return HIP_RETURN(hipErrorContextIsDestroyed); + } + + if (stream == nullptr) { // handle null stream + // null stream is associated with current device, return the device id associated with the + // current device + *device = hip::getCurrentDevice()->deviceId(); + } else { + getStreamPerThread(stream); + *device = reinterpret_cast<hip::Stream*>(stream)->DeviceId(); + } + + HIP_RETURN(hipSuccess); +}
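hipStreamGetDevice() above is the one new public entry point in this file; a short usage sketch (error checks omitted):

hipStream_t stream;
hipStreamCreate(&stream);
hipDevice_t device;
hipStreamGetDevice(stream, &device);   // device the stream was created on
hipStreamGetDevice(nullptr, &device);  // the null stream reports the current device
hipStreamDestroy(stream);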
diff --git a/src/hip_stream_ops.cpp b/src/hip_stream_ops.cpp index a3bed6cf..7032c4c6 100644 --- a/src/hip_stream_ops.cpp +++ b/src/hip_stream_ops.cpp @@ -69,11 +69,11 @@ hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void return hipErrorInvalidValue; } - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); amd::Command::EventWaitList waitList; amd::StreamOperationCommand* command = - new amd::StreamOperationCommand(*queue, cmdType, waitList, *memory->asBuffer(), + new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(), value, mask, outFlags, offset, sizeBytes); if (command == nullptr) { diff --git a/src/hip_texture.cpp b/src/hip_texture.cpp index 610d93fe..8c443739 100644 --- a/src/hip_texture.cpp +++ b/src/hip_texture.cpp @@ -25,9 +25,6 @@ #include "hip_conversions.hpp" #include "platform/sampler.hpp" -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); - hipError_t ihipFree(void* ptr); struct __hip_texture { @@ -575,8 +572,8 @@ hipError_t hipBindTexture2D(size_t* offset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t ihipBindTextureToArray(const textureReference* texref, @@ -624,8 +621,8 @@ hipError_t hipBindTextureToArray(const textureReference* texref, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t ihipBindTextureToMipmappedArray(const textureReference* texref, @@ -674,8 +671,8 @@ hipError_t hipBindTextureToMipmappedArray(const textureReference* texref, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipUnbindTexture(const textureReference* texref) { @@ -701,8 +698,8 @@ hipError_t hipBindTexture(size_t* offset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, @@ -966,8 +963,8 @@ hipError_t hipTexRefSetArray(textureReference* texRef, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr, @@ -1049,8 +1046,8 @@ hipError_t hipTexRefSetAddress(size_t* ByteOffset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexRefSetAddress2D(textureReference* texRef, @@ -1091,8 +1088,8 @@ hipError_t hipTexRefSetAddress2D(textureReference* texRef, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) { @@ -1454,8 +1451,8 @@ hipError_t hipTexRefSetMipmappedArray(textureReference* texRef, HIP_RETURN(err); } // Copy to device. 
- amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, diff --git a/src/hiprtc/cmake/HIPRTC.cmake b/src/hiprtc/cmake/HIPRTC.cmake index 19b59f36..521b0234 100644 --- a/src/hiprtc/cmake/HIPRTC.cmake +++ b/src/hiprtc/cmake/HIPRTC.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,6 +29,7 @@ function(get_hiprtc_macros HIPRTC_DEFINES) set(${HIPRTC_DEFINES} "#pragma clang diagnostic push\n\ #pragma clang diagnostic ignored \"-Wreserved-id-macro\"\n\ +#pragma clang diagnostic ignored \"-Wc++98-compat-pedantic\"\n\ #define __device__ __attribute__((device))\n\ #define __host__ __attribute__((host))\n\ #define __global__ __attribute__((global))\n\ @@ -51,7 +52,10 @@ function(get_hiprtc_macros HIPRTC_DEFINES) #pragma clang diagnostic pop\n\ #define HIP_INCLUDE_HIP_HIP_RUNTIME_H\n\ #define HIP_INCLUDE_HIP_HIP_FP16_H\n\ +#pragma clang diagnostic push\n\ +#pragma clang diagnostic ignored \"-Wreserved-macro-identifier\"\n\ #define _HIP_BFLOAT16_H_\n\ +#pragma clang diagnostic pop\n\ #define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H" PARENT_SCOPE) endfunction(get_hiprtc_macros) @@ -63,12 +67,15 @@ if(HIPRTC_ADD_MACROS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_DEFINES}") FILE(READ "${HIPRTC_WARP_HEADER_FILE}" HIPRTC_WARP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_WARP_HEADER}") +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" FILE(READ "${HIPRTC_COOP_HELPER_FILE}" HIPRTC_COOP_HELPER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HELPER}") FILE(READ "${HIPRTC_COOP_HEADER_FILE}" HIPRTC_COOP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HEADER}") FILE(READ "${HIPRTC_UNSAFE_ATOMICS_FILE}" HIPRTC_UNSAFE_ATOMICS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_UNSAFE_ATOMICS}") +#pragma clang diagnostic pop endif() macro(generate_hiprtc_header HiprtcHeader) diff --git a/src/hiprtc/hiprtcComgrHelper.cpp b/src/hiprtc/hiprtcComgrHelper.cpp index b8dca86c..c3794c61 100644 --- a/src/hiprtc/hiprtcComgrHelper.cpp +++ b/src/hiprtc/hiprtcComgrHelper.cpp @@ -157,6 +157,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx90c"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX940: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx940"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; @@ -172,6 +177,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1012"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1013: + xnackSupported = true; + sramEccSupported = false; + proc_name = "gfx1013"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1030: xnackSupported = false; sramEccSupported = false; @@ -192,6 +202,41 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1033"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1034: 
+ xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1034"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1035: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1035"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1036: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1036"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1100"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1101: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1101"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1102: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1102"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1103: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1103"; + break; default: return false; } diff --git a/src/hiprtc/hiprtcInternal.cpp b/src/hiprtc/hiprtcInternal.cpp index dfff262c..188b42d8 100644 --- a/src/hiprtc/hiprtcInternal.cpp +++ b/src/hiprtc/hiprtcInternal.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2022 - Present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -117,11 +117,6 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.reserve(20); // count of options below compile_options_.push_back("-O3"); -#ifdef HIPRTC_EARLY_INLINE - compile_options_.push_back("-mllvm"); - compile_options_.push_back("-amdgpu-early-inline-all"); -#endif - if (GPU_ENABLE_WGP_MODE) compile_options_.push_back("-mcumode"); if (!GPU_ENABLE_WAVE32_MODE) compile_options_.push_back("-mwavefrontsize64"); @@ -135,6 +130,8 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.push_back("hiprtc_runtime.h"); compile_options_.push_back("-std=c++14"); compile_options_.push_back("-nogpuinc"); + compile_options_.push_back("-Wno-gnu-line-marker"); + compile_options_.push_back("-Wno-missing-prototypes"); #ifdef _WIN32 compile_options_.push_back("-target"); compile_options_.push_back("x86_64-pc-windows-msvc"); @@ -186,7 +183,23 @@ bool RTCCompileProgram::addBuiltinHeader() { return true; } -bool RTCCompileProgram::transformOptions() { +bool RTCCompileProgram::findLLVMOptions(const std::vector<std::string>& options, + std::vector<std::string>& llvm_options) { + for (size_t i = 0; i < options.size(); ++i) { + if (options[i] == "-mllvm") { + if (options.size() == (i+1)) { + LogInfo( + "-mllvm option passed by the app, it comes as a pair but there is no option after this"); + return false; + } + llvm_options.push_back(options[i]); + llvm_options.push_back(options[i + 1]); + } + } + return true; +} + +bool RTCCompileProgram::transformOptions(std::vector<std::string>& compile_options) { auto getValueOf = [](const std::string& option) { std::string res; auto f = std::find(option.begin(), option.end(), '='); @@ -194,7 +207,7 @@ return res; }; - for (auto& i : compile_options_) { + for (auto& i : compile_options) { if (i == "-hip-pch") { LogInfo( "-hip-pch is deprecated option, has no impact on execution of new hiprtc programs, it " ... } } if (auto res = std::find_if( - compile_options_.begin(), compile_options_.end(), + compile_options.begin(), compile_options.end(), [](const 
std::string& str) { return str.find("--offload-arch=") != std::string::npos; }); - res != compile_options_.end()) { + res != compile_options.end()) { auto isaName = getValueOf(*res); isa_ = "amdgcn-amd-amdhsa--" + isaName; settings_.offloadArchProvided = true; @@ -240,15 +253,21 @@ bool RTCCompileProgram::compile(const std::vector<std::string>& options, bool fg fgpu_rdc_ = fgpu_rdc; // Append compile options - compile_options_.reserve(compile_options_.size() + options.size()); - compile_options_.insert(compile_options_.end(), options.begin(), options.end()); + std::vector<std::string> compileOpts(compile_options_); + compileOpts.reserve(compile_options_.size() + options.size() + 2); + compileOpts.insert(compileOpts.end(), options.begin(), options.end()); + + if (!fgpu_rdc_) { + compileOpts.push_back("-Xclang"); + compileOpts.push_back("-disable-llvm-passes"); + } - if (!transformOptions()) { + if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compile_options_, build_log_, LLVMBitcode_)) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } @@ -285,14 +304,30 @@ bool RTCCompileProgram::compile(const std::vector<std::string>& options, bool fg return false; } + std::vector<std::string> llvmOptions; + // Find the -mllvm options passed by the app such as "-mllvm" "-amdgpu-early-inline-all=true" + if (!findLLVMOptions(options, llvmOptions)) { + LogError("Error in hiprtc: unable to match -mllvm options"); + return false; + } + + std::vector<std::string> exeOpts(exe_options_); + exeOpts.reserve(exeOpts.size() + llvmOptions.size() + 2); + // Add these options by default for optimizations during BC to Relocatable phase. + exeOpts.push_back("-mllvm"); + exeOpts.push_back("-amdgpu-internalize-symbols"); + // User provided -mllvm options are appended at the end since they can override the above + // default options if necessary + exeOpts.insert(exeOpts.end(), llvmOptions.begin(), llvmOptions.end()); + if (settings_.dumpISA) { - if (!dumpIsaFromBC(exec_input_, isa_, exe_options_, name_, build_log_)) { + if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) { LogError("Error in hiprtc: unable to dump isa code"); return false; } } - if (!createExecutable(exec_input_, isa_, exe_options_, build_log_, executable_)) { + if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) { LogError("Error in hiprtc: unable to create executable"); return false; } diff --git a/src/hiprtc/hiprtcInternal.hpp b/src/hiprtc/hiprtcInternal.hpp index 8b846e49..28fcc90f 100644 --- a/src/hiprtc/hiprtcInternal.hpp +++ b/src/hiprtc/hiprtcInternal.hpp @@ -146,7 +146,9 @@ class RTCCompileProgram : public RTCProgram { // Private Member functions bool addSource_impl(); bool addBuiltinHeader(); - bool transformOptions(); + bool transformOptions(std::vector<std::string>& compile_options); + bool findLLVMOptions(const std::vector<std::string>& options, + std::vector<std::string>& llvm_options); RTCCompileProgram() = delete; RTCCompileProgram(RTCCompileProgram&) = delete;
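Taken together, the hiprtc changes stop folding user options into the persistent compile_options_ vector, defer optimization to the link step for non-RDC compiles via -Xclang -disable-llvm-passes, and forward any "-mllvm" pairs supplied by the caller into the bitcode-to-relocatable phase after the default -amdgpu-internalize-symbols. The early inlining that used to be hardwired under HIPRTC_EARLY_INLINE can therefore be requested per compile; a sketch, assuming prog is an existing hiprtcProgram (error checks omitted):

const char* options[] = {"--offload-arch=gfx90a",                     // target ISA
                         "-mllvm", "-amdgpu-early-inline-all=true"};  // forwarded as a pair
hiprtcResult res = hiprtcCompileProgram(prog, 3, options);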