Skip to content
This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Permalink
Promote till commit '7f83be52c996287aa96aaa626abf372c333a6eb1'
Browse files Browse the repository at this point in the history
Change-Id: Ia4d539c7f3190a10d0ccf61692f094b34c807004
  • Loading branch information
mangupta committed Mar 15, 2023
2 parents 6873466 + 7f83be5 commit d1e0ee9
Show file tree
Hide file tree
Showing 52 changed files with 1,400 additions and 1,114 deletions.
4 changes: 1 addition & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -447,9 +447,7 @@ if(${RUN_HIT} EQUAL 0)
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
endif()
if(HIP_CATCH_TEST EQUAL "1")
enable_testing()
set(HIP_PATH ${HIP_ROOT_DIR})
add_subdirectory(${HIP_COMMON_DIR}/tests/catch ${PROJECT_BINARY_DIR}/catch)
message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! ")
else()
if(${RUN_HIT} EQUAL 0)
set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})
Expand Down
10 changes: 0 additions & 10 deletions hip-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -206,16 +206,6 @@ if(HIP_COMPILER STREQUAL "clang")

if (NOT compilePropIsSet)
hip_add_interface_compile_flags(hip::device -x hip)
if (NOT EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode)
# This path is to support an older build of the device library
# TODO: To be removed in the future.
if(WIN32)
hip_add_interface_compile_flags(hip::device -fms-extensions -fms-compatibility)
hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${HIP_PATH}/lib/bitcode\")
else()
hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${AMD_DEVICE_LIBS_PREFIX}/lib\")
endif()
endif()
endif()

hip_add_interface_link_flags(hip::device --hip-link)
Expand Down
27 changes: 13 additions & 14 deletions include/hip/amd_detail/amd_device_functions.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -444,7 +444,8 @@ __device__ static inline unsigned long long int __double2ull_ru(double x) {
__device__ static inline unsigned long long int __double2ull_rz(double x) {
return (unsigned long long int)x;
}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
__device__ static inline long long int __double_as_longlong(double x) {
static_assert(sizeof(long long) == sizeof(double), "");

Expand All @@ -453,6 +454,7 @@ __device__ static inline long long int __double_as_longlong(double x) {

return tmp;
}
#pragma clang diagnostic pop

/*
__device__ unsigned short __float2half_rn(float x);
Expand Down Expand Up @@ -775,21 +777,21 @@ __device__
inline
static void __threadfence()
{
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

__device__
inline
static void __threadfence_block()
{
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

__device__
inline
static void __threadfence_system()
{
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}

// abort
Expand Down Expand Up @@ -862,14 +864,11 @@ void __assertfail()
}
#endif /* defined(_WIN32) || defined(_WIN64) */

__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
if (flags) {
__atomic_work_item_fence(flags, __memory_order_release, scope);
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
__builtin_amdgcn_s_barrier();
__atomic_work_item_fence(flags, __memory_order_acquire, scope);
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
} else {
__builtin_amdgcn_s_barrier();
}
Expand All @@ -879,7 +878,7 @@ __device__
inline
static void __barrier(int n)
{
__work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
__work_group_barrier((__cl_mem_fence_flags)n);
}

__device__
Expand Down Expand Up @@ -922,7 +921,7 @@ int __syncthreads_or(int predicate)
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
CU_ID 11:8 Compute Unit the wave is assigned to.
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
SE_ID 14:13 Shader Engine the wave is assigned to.
SE_ID 15:13 Shader Engine the wave is assigned to.
TG_ID 19:16 Thread-group ID
VM_ID 23:20 Virtual Memory ID
QUEUE_ID 26:24 Queue from which this wave was dispatched.
Expand All @@ -935,7 +934,7 @@ int __syncthreads_or(int predicate)
#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8

#define HW_ID_SE_ID_SIZE 2
#define HW_ID_SE_ID_SIZE 3
#define HW_ID_SE_ID_OFFSET 13

/*
Expand Down
8 changes: 3 additions & 5 deletions include/hip/amd_detail/amd_hip_bfloat16.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@
#include "host_defines.h"
#if defined(__HIPCC_RTC__)
#define __HOST_DEVICE__ __device__
#define HIP_OSTREAM __hip_internal::ostream
#else
#define __HOST_DEVICE__ __host__ __device__
#define HIP_OSTREAM std::ostream
#endif

#if __cplusplus < 201103L || !defined(__HIPCC__)
Expand Down Expand Up @@ -181,12 +179,12 @@ static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
&& offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
"internal hip_bfloat16 does not match public hip_bfloat16");
#endif

inline HIP_OSTREAM& operator<<(HIP_OSTREAM& os, const hip_bfloat16& bf16)
inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
{
return os << bf16;
return os << float(bf16);
}
#endif

inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
{
Expand Down
17 changes: 11 additions & 6 deletions include/hip/amd_detail/amd_hip_complex.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,20 @@ THE SOFTWARE.
return lhs; \
}

#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
lhs = lhs * rhs; \
return lhs; \
#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
type temp{lhs}; \
lhs.x = rhs.x * temp.x - rhs.y * temp.y; \
lhs.y = rhs.y * temp.x + rhs.x * temp.y; \
return lhs; \
}

#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
lhs = lhs / rhs; \
__HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
type temp; \
temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
lhs = temp; \
return lhs; \
}

Expand Down
32 changes: 19 additions & 13 deletions include/hip/amd_detail/amd_hip_cooperative_groups.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -32,6 +32,13 @@ THE SOFTWARE.
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wsign-conversion"
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wpadded"

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
Expand Down Expand Up @@ -69,8 +76,8 @@ class thread_group {
// only when the group is supposed to contain only the calling the thread
// (throurh the API - `this_thread()`), and in all other cases, this thread
// group object is a sub-object of some other derived thread group object
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = (uint64_t)0,
uint64_t mask = (uint64_t)0) {
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
uint64_t mask = static_cast<uint64_t>(0)) {
_type = type;
_size = size;
_mask = mask;
Expand Down Expand Up @@ -199,7 +206,7 @@ class thread_block : public thread_group {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
// Invalid tile size, assert
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size");
__hip_assert(false && "invalid tile size")
}

thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
Expand Down Expand Up @@ -246,7 +253,7 @@ class tiled_group : public thread_group {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size");
__hip_assert(false && "invalid tile size")
}

if (size() <= tile_size) {
Expand Down Expand Up @@ -282,7 +289,6 @@ class tiled_group : public thread_group {
* \details Represents a active thread group in a wavefront.
* This group type also supports sub-wave level intrinsics.
*/

class coalesced_group : public thread_group {
private:
friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
Expand All @@ -300,8 +306,8 @@ class coalesced_group : public thread_group {
// prepare a mask for further partitioning it so that it stays coalesced.
if (coalesced_info.tiled_info.is_tiled) {
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
unsigned int masklength = min((unsigned int)size() - base_offset, tile_size);
lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

member_mask <<= (__lane_id() & ~(tile_size - 1));
coalesced_group coalesced_tile = coalesced_group(member_mask);
Expand Down Expand Up @@ -358,7 +364,7 @@ class coalesced_group : public thread_group {
__CG_QUALIFIER__ T shfl(T var, int srcRank) const {
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

srcRank = srcRank % size();
srcRank = srcRank % static_cast<int>(size());

int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
: (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
Expand Down Expand Up @@ -452,7 +458,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
return (static_cast<const coalesced_group*>(this)->thread_rank());
}
default: {
__hip_assert(false && "invalid cooperative group type");
__hip_assert(false && "invalid cooperative group type")
return -1;
}
}
Expand All @@ -476,7 +482,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const {
return (static_cast<const coalesced_group*>(this)->is_valid());
}
default: {
__hip_assert(false && "invalid cooperative group type");
__hip_assert(false && "invalid cooperative group type")
return false;
}
}
Expand Down Expand Up @@ -505,7 +511,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
break;
}
default: {
__hip_assert(false && "invalid cooperative group type");
__hip_assert(false && "invalid cooperative group type")
}
}
}
Expand Down Expand Up @@ -697,6 +703,6 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
} // namespace cooperative_groups

#pragma clang diagnostic pop
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
67 changes: 65 additions & 2 deletions include/hip/amd_detail/amd_hip_fp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -1569,7 +1569,7 @@ THE SOFTWARE.
__half hrcp(__half x)
{
return __half_raw{
__llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
}
inline
__device__
Expand Down Expand Up @@ -1672,7 +1672,10 @@ THE SOFTWARE.
__half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
inline
__HOST_DEVICE__
__half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
__half2 h2rcp(__half2 x) {
return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
}
inline
__HOST_DEVICE__
__half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
Expand Down Expand Up @@ -1707,6 +1710,66 @@ THE SOFTWARE.
using half = __half;
using half2 = __half2;
#endif
#if !defined(__HIPCC_RTC__)
#include "amd_device_functions.h"
#include "amd_warp_functions.h"
__device__
inline
__half __shfl(__half var, int src_lane, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.h;
}
__device__
inline
__half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.h;
}
__device__
inline
__half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.h;
}
#endif // !defined(__HIPCC_RTC__)
#endif // defined(__cplusplus)
#elif defined(__GNUC__)
#include "hip_fp16_gcc.h"
Expand Down
Loading

0 comments on commit d1e0ee9

Please sign in to comment.