Promote till commit '7f83be52c996287aa96aaa626abf372c333a6eb1'

Change-Id: Ia4d539c7f3190a10d0ccf61692f094b34c807004
ROCm · Mar 15, 2023 · d1e0ee9 · d1e0ee9
2 parents 6873466 + 7f83be5
commit d1e0ee9
Show file tree

Hide file tree

Showing 52 changed files with 1,400 additions and 1,114 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -447,9 +447,7 @@ if(${RUN_HIT} EQUAL 0)
     execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 endif()
 if(HIP_CATCH_TEST EQUAL "1")
-    enable_testing()
-    set(HIP_PATH ${HIP_ROOT_DIR})
-    add_subdirectory(${HIP_COMMON_DIR}/tests/catch ${PROJECT_BINARY_DIR}/catch)
+	message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! ")
 else()
     if(${RUN_HIT} EQUAL 0)
         set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})

diff --git a/hip-config.cmake.in b/hip-config.cmake.in
@@ -206,16 +206,6 @@ if(HIP_COMPILER STREQUAL "clang")
 
   if (NOT compilePropIsSet)
     hip_add_interface_compile_flags(hip::device -x hip)
-    if (NOT EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode)
-      # This path is to support an older build of the device library
-      # TODO: To be removed in the future.
-      if(WIN32)
-        hip_add_interface_compile_flags(hip::device -fms-extensions -fms-compatibility)
-        hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${HIP_PATH}/lib/bitcode\")
-      else()
-        hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${AMD_DEVICE_LIBS_PREFIX}/lib\")
-      endif()
-    endif()
   endif()
 
   hip_add_interface_link_flags(hip::device --hip-link)

diff --git a/include/hip/amd_detail/amd_device_functions.h b/include/hip/amd_detail/amd_device_functions.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -444,7 +444,8 @@ __device__ static inline unsigned long long int __double2ull_ru(double x) {
 __device__ static inline unsigned long long int __double2ull_rz(double x) {
   return (unsigned long long int)x;
 }
-
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
 __device__ static inline long long int __double_as_longlong(double x) {
     static_assert(sizeof(long long) == sizeof(double), "");
 
@@ -453,6 +454,7 @@ __device__ static inline long long int __double_as_longlong(double x) {
 
     return tmp;
 }
+#pragma clang diagnostic pop
 
 /*
 __device__ unsigned short __float2half_rn(float x);
@@ -775,21 +777,21 @@ __device__
 inline
 static void __threadfence()
 {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
+    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
 }
 
 __device__
 inline
 static void __threadfence_block()
 {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
+    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }
 
 __device__
 inline
 static void __threadfence_system()
 {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
+    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
 }
 
 // abort
@@ -862,14 +864,11 @@ void __assertfail()
 }
 #endif /* defined(_WIN32) || defined(_WIN64) */
 
-__device__
-inline
-static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
-{
+__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
     if (flags) {
-        __atomic_work_item_fence(flags, __memory_order_release, scope);
+        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
         __builtin_amdgcn_s_barrier();
-        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
+        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
     } else {
         __builtin_amdgcn_s_barrier();
     }
@@ -879,7 +878,7 @@ __device__
 inline
 static void __barrier(int n)
 {
-  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
+  __work_group_barrier((__cl_mem_fence_flags)n);
 }
 
 __device__
@@ -922,7 +921,7 @@ int __syncthreads_or(int predicate)
    PIPE_ID     7:6     Pipeline from which the wave was dispatched.
    CU_ID       11:8    Compute Unit the wave is assigned to.
    SH_ID       12      Shader Array (within an SE) the wave is assigned to.
-   SE_ID       14:13   Shader Engine the wave is assigned to.
+   SE_ID       15:13   Shader Engine the wave is assigned to.
    TG_ID       19:16   Thread-group ID
    VM_ID       23:20   Virtual Memory ID
    QUEUE_ID    26:24   Queue from which this wave was dispatched.
@@ -935,7 +934,7 @@ int __syncthreads_or(int predicate)
 #define HW_ID_CU_ID_SIZE    4
 #define HW_ID_CU_ID_OFFSET  8
 
-#define HW_ID_SE_ID_SIZE    2
+#define HW_ID_SE_ID_SIZE    3
 #define HW_ID_SE_ID_OFFSET  13
 
 /*

diff --git a/include/hip/amd_detail/amd_hip_bfloat16.h b/include/hip/amd_detail/amd_hip_bfloat16.h
@@ -32,10 +32,8 @@
 #include "host_defines.h"
 #if defined(__HIPCC_RTC__)
     #define __HOST_DEVICE__ __device__
-    #define HIP_OSTREAM __hip_internal::ostream
 #else
     #define __HOST_DEVICE__ __host__ __device__
-    #define HIP_OSTREAM std::ostream
 #endif
 
 #if __cplusplus < 201103L || !defined(__HIPCC__)
@@ -181,12 +179,12 @@ static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
 static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
                   && offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
               "internal hip_bfloat16 does not match public hip_bfloat16");
-#endif
 
-inline HIP_OSTREAM& operator<<(HIP_OSTREAM& os, const hip_bfloat16& bf16)
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
 {
-    return os << bf16;
+  return os << float(bf16);
 }
+#endif
 
 inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
 {

diff --git a/include/hip/amd_detail/amd_hip_complex.h b/include/hip/amd_detail/amd_hip_complex.h
@@ -106,15 +106,20 @@ THE SOFTWARE.
         return lhs;                                                                                \
     }
 
-#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {               \
-        lhs = lhs * rhs;                                                                           \
-        return lhs;                                                                                \
+#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                            \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {                    \
+        type temp{lhs};                                                                             \
+        lhs.x = rhs.x * temp.x - rhs.y * temp.y;                                                    \
+        lhs.y = rhs.y * temp.x + rhs.x * temp.y;                                                    \
+        return lhs;                                                                                 \
     }
 
 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {               \
-        lhs = lhs / rhs;                                                                           \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {                   \
+        type temp;                                                                                 \
+        temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y);                      \
+        temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y);                    \
+        lhs = temp;                                                                                \
         return lhs;                                                                                \
     }
 

diff --git a/include/hip/amd_detail/amd_hip_cooperative_groups.h b/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,13 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wpadded"
+
 #if __cplusplus
 #if !defined(__HIPCC_RTC__)
 #include <hip/amd_detail/hip_cooperative_groups_helper.h>
@@ -69,8 +76,8 @@ class thread_group {
   // only when the group is supposed to contain only the calling the thread
   // (throurh the API - `this_thread()`), and in all other cases, this thread
   // group object is a sub-object of some other derived thread group object
-  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = (uint64_t)0,
-                                uint64_t mask = (uint64_t)0) {
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
+                                uint64_t mask = static_cast<uint64_t>(0)) {
     _type = type;
     _size = size;
     _mask = mask;
@@ -199,7 +206,7 @@ class thread_block : public thread_group {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
     // Invalid tile size, assert
     if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
-      __hip_assert(false && "invalid tile size");
+      __hip_assert(false && "invalid tile size")
     }
 
     thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
@@ -246,7 +253,7 @@ class tiled_group : public thread_group {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
 
     if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
-      __hip_assert(false && "invalid tile size");
+      __hip_assert(false && "invalid tile size")
     }
 
     if (size() <= tile_size) {
@@ -282,7 +289,6 @@ class tiled_group : public thread_group {
  *  \details Represents a active thread group in a wavefront.
  *           This group type also supports sub-wave level intrinsics.
  */
-
 class coalesced_group : public thread_group {
  private:
   friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
@@ -300,8 +306,8 @@ class coalesced_group : public thread_group {
     // prepare a mask for further partitioning it so that it stays coalesced.
     if (coalesced_info.tiled_info.is_tiled) {
       unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
-      unsigned int masklength = min((unsigned int)size() - base_offset, tile_size);
-      lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
+      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
+      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
 
       member_mask <<= (__lane_id() & ~(tile_size - 1));
       coalesced_group coalesced_tile = coalesced_group(member_mask);
@@ -358,7 +364,7 @@ class coalesced_group : public thread_group {
   __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
     static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
 
-    srcRank = srcRank % size();
+    srcRank = srcRank % static_cast<int>(size());
 
     int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
              : (__AMDGCN_WAVEFRONT_SIZE == 64)     ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
@@ -452,7 +458,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
       return (static_cast<const coalesced_group*>(this)->thread_rank());
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
       return -1;
     }
   }
@@ -476,7 +482,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const {
       return (static_cast<const coalesced_group*>(this)->is_valid());
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
       return false;
     }
   }
@@ -505,7 +511,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
       break;
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
     }
   }
 }
@@ -697,6 +703,6 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
   return impl::tiled_partition_internal<size, ParentCGTy>(g);
 }
 }  // namespace cooperative_groups
-
+#pragma clang diagnostic pop
 #endif  // __cplusplus
 #endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h
@@ -1569,7 +1569,7 @@ THE SOFTWARE.
             __half hrcp(__half x)
             {
                 return __half_raw{
-                    __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
+                    static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
             }
             inline
             __device__
@@ -1672,7 +1672,10 @@ THE SOFTWARE.
             __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
             inline
             __HOST_DEVICE__
-            __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
+            __half2 h2rcp(__half2 x) {
+                return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
+                                  static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+            }
             inline
             __HOST_DEVICE__
             __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
@@ -1707,6 +1710,66 @@ THE SOFTWARE.
             using half = __half;
             using half2 = __half2;
         #endif
+        #if !defined(__HIPCC_RTC__)
+        #include "amd_device_functions.h"
+        #include "amd_warp_functions.h"
+            __device__
+            inline
+            __half __shfl(__half var, int src_lane, int width = warpSize) {
+                union { int i; __half h; } tmp; tmp.h = var;
+                tmp.i = __shfl(tmp.i, src_lane, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
+                union { int i; __half2 h; } tmp; tmp.h = var;
+                tmp.i = __shfl(tmp.i, src_lane, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
+                union { int i; __half h; } tmp; tmp.h = var;
+                tmp.i = __shfl_up(tmp.i, lane_delta, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
+                union { int i; __half2 h; } tmp; tmp.h = var;
+                tmp.i = __shfl_up(tmp.i, lane_delta, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
+                union { int i; __half h; } tmp; tmp.h = var;
+                tmp.i = __shfl_down(tmp.i, lane_delta, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
+                union { int i; __half2 h; } tmp; tmp.h = var;
+                tmp.i = __shfl_down(tmp.i, lane_delta, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half __shfl_xor(__half var,  int lane_mask, int width = warpSize) {
+                union { int i; __half h; } tmp; tmp.h = var;
+                tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+                return tmp.h;
+            }
+            __device__
+            inline
+            __half2 __shfl_xor(__half2 var,  int lane_mask, int width = warpSize) {
+                union { int i; __half2 h; } tmp; tmp.h = var;
+                tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+                return tmp.h;
+            }
+        #endif // !defined(__HIPCC_RTC__)
     #endif // defined(__cplusplus)
 #elif defined(__GNUC__)
     #include "hip_fp16_gcc.h"